[llvm] 2f5a116 - AMDGPU: Expand casted f16 fmed3 pattern to fmin/fmax on gfx8
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue May 23 00:48:33 PDT 2023
Author: Matt Arsenault
Date: 2023-05-23T08:48:25+01:00
New Revision: 2f5a116cf70e4717bce20c3d9d4c7a41aa6d89ec
URL: https://github.com/llvm/llvm-project/commit/2f5a116cf70e4717bce20c3d9d4c7a41aa6d89ec
DIFF: https://github.com/llvm/llvm-project/commit/2f5a116cf70e4717bce20c3d9d4c7a41aa6d89ec.diff
LOG: AMDGPU: Expand casted f16 fmed3 pattern to fmin/fmax on gfx8
If we have legal f16 instructions but no f16 med3, we can save one
instruction by expanding the operation into an f16 min/max sequence
instead of casting to f32, using the f32 med3, and casting back.
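
For reference, the combine relies on the median-of-three identity
med3(a, b, c) = min(max(a, b), max(min(a, b), c)). A minimal
standalone C++ sketch of that identity follows (a hypothetical helper,
not part of the patch; std::fmin/std::fmax here stand in for the IEEE
minnum/maxnum semantics of the emitted v_min_f16/v_max_f16, ignoring
the signaling-NaN subtlety discussed in the code below):

  #include <cmath>

  // med3_ref returns the median of A, B and C using two mins and two
  // maxes -- the same four-instruction shape the combine emits.
  static float med3_ref(float A, float B, float C) {
    float Min01 = std::fmin(A, B); // v_min_f16 A, B
    float Max01 = std::fmax(A, B); // v_max_f16 A, B
    float Mid   = std::fmax(Min01, C);
    return std::fmin(Max01, Mid);  // median of the three inputs
  }

On the cast path gfx8 needs three v_cvt_f32_f16, one v_med3_f32 and
one v_cvt_f16_f32 (five instructions); the four min/max instructions
above are where the one-instruction saving comes from, as the updated
checks in fmed3-cast-combine.ll show.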
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUCombine.td
llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.h
llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 228609490863..934524b8e1ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1715,6 +1715,8 @@ def NotHasTrue16BitInsts : Predicate<"!Subtarget->hasTrue16BitInsts()">;
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<(all_of FeatureVOP3P)>;
+def NotHasMed3_16 : Predicate<"!Subtarget->hasMed3_16()">;
+
def HasMinMaxDenormModes : Predicate<"Subtarget->supportsMinMaxDenormModes()">;
def NotHasMinMaxDenormModes : Predicate<"!Subtarget->supportsMinMaxDenormModes()">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index e5a53f7f8c3b..ce657ed27152 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -84,7 +84,7 @@ def fp_minmax_to_clamp : GICombineRule<
def fmed3_intrinsic_to_clamp : GICombineRule<
(defs root:$fmed3, register_matchinfo:$matchinfo),
- (match (wip_match_opcode G_INTRINSIC):$fmed3,
+ (match (wip_match_opcode G_AMDGPU_FMED3):$fmed3,
[{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]),
(apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>;
@@ -112,9 +112,33 @@ def sign_extension_in_reg : GICombineRule<
[{ return PostLegalizerHelper.matchCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }]),
(apply [{ PostLegalizerHelper.applyCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }])>;
-// Combines which should only apply on SI/VI
+
+let Predicates = [Has16BitInsts, NotHasMed3_16] in {
+// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
+// saves one instruction compared to the promotion.
+//
+// FIXME: Should have ComplexPattern like in/out matchers
+//
+// FIXME: We should be able to match either G_AMDGPU_FMED3 or
+// G_INTRINSIC @llvm.amdgcn.fmed3. Currently the legalizer will
+// replace the intrinsic with G_AMDGPU_FMED3 since we can't write a
+// pattern to match it.
+def expand_promoted_fmed3 : GICombineRule<
+ (defs root:$fptrunc_dst),
+ (match (G_FPTRUNC $fptrunc_dst, $fmed3_dst):$fptrunc,
+ (G_AMDGPU_FMED3 $fmed3_dst, $src0, $src1, $src2),
+ [{ return Helper.matchExpandPromotedF16FMed3(*${fptrunc}, ${src0}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }]),
+ (apply [{ Helper.applyExpandPromotedF16FMed3(*${fptrunc}, ${src0}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }])
+>;
+
+} // End Predicates = [Has16BitInsts, NotHasMed3_16]
+
+// Combines which should only apply on SI/CI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
+// Combines which should only apply on VI
+def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>;
+
def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPreLegalizerCombinerHelper",
[all_combines, clamp_i64_to_i16, foldable_fneg]> {
@@ -125,7 +149,7 @@ def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPostLegalizerCombinerHelper",
- [all_combines, gfx6gfx7_combines,
+ [all_combines, gfx6gfx7_combines, gfx8_combines,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
rcp_sqrt_to_rsq, sign_extension_in_reg]> {
let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
index 069baf748bfa..78fdedc0b511 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -380,3 +380,56 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
MI.eraseFromParent();
}
+
+// TODO: Should return converted value / extension source and avoid introducing
+// intermediate fptruncs in the apply function.
+static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
+ Register Reg) {
+ const MachineInstr *Def = MRI.getVRegDef(Reg);
+ if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
+ Register SrcReg = Def->getOperand(1).getReg();
+ return MRI.getType(SrcReg) == LLT::scalar(16);
+ }
+
+ if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
+ APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
+ bool LosesInfo = true;
+ Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
+ return !LosesInfo;
+ }
+
+ return false;
+}
+
+bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
+ Register Src0,
+ Register Src1,
+ Register Src2) {
+ assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
+ return false;
+
+ return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
+ isFPExtFromF16OrConst(MRI, Src2);
+}
+
+void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
+ Register Src0,
+ Register Src1,
+ Register Src2) {
+ Builder.setInstrAndDebugLoc(MI);
+
+ // We expect fptrunc (fpext x) to fold out, and to constant fold any constant
+ // sources.
+ Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
+ Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
+ Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);
+
+ LLT Ty = MRI.getType(Src0);
+ auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
+ auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
+ auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
+ Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
+ MI.eraseFromParent();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
index eb72b342dbf2..a68a4444aa5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
@@ -23,4 +23,9 @@ class AMDGPUCombinerHelper : public CombinerHelper {
bool matchFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo);
void applyFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo);
+
+ bool matchExpandPromotedF16FMed3(MachineInstr &MI, Register Src0,
+ Register Src1, Register Src2);
+ void applyExpandPromotedF16FMed3(MachineInstr &MI, Register Src0,
+ Register Src1, Register Src2);
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 296cc34ef1fc..703a457c4994 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5885,6 +5885,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
case Intrinsic::amdgcn_image_bvh_intersect_ray:
return legalizeBVHIntrinsic(MI, B);
+ case Intrinsic::amdgcn_fmed3: {
+ GISelChangeObserver &Observer = Helper.Observer;
+
+ // FIXME: This is to workaround the inability of tablegen match combiners to
+ // match intrinsics in patterns.
+ Observer.changingInstr(MI);
+ MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
+ MI.removeOperand(1);
+ Observer.changedInstr(MI);
+ return true;
+ }
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 1f17035be64c..dd26a711cc03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -271,14 +271,11 @@ bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr &MI,
// min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0
bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr &MI,
Register &Reg) {
- if (MI.getIntrinsicID() != Intrinsic::amdgcn_fmed3)
- return false;
-
// In llvm-ir, clamp is often represented as an intrinsic call to
// @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders.
- MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
- MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI);
- MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI);
+ MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
+ MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
+ MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI);
if (isFCst(Src0) && !isFCst(Src1))
std::swap(Src0, Src1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 810ed0746e91..5ae5cedec40e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3762,6 +3762,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
case AMDGPU::G_AMDGPU_SMED3:
+ case AMDGPU::G_AMDGPU_FMED3:
return getDefaultMappingVOP(MI);
case AMDGPU::G_UMULH:
case AMDGPU::G_SMULH: {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c8329730c678..7eefe7c57698 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -772,6 +772,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::INSERT_VECTOR_ELT,
ISD::FCOPYSIGN});
+ if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
+ setTargetDAGCombine(ISD::FP_ROUND);
+
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
setTargetDAGCombine({ISD::LOAD,
@@ -11106,6 +11109,71 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
return DAG.getBuildVector(VecVT, SL, Ops);
}
+/// Return the source of an fp_extend from f16 to f32, or a converted FP
+/// constant.
+static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
+ if (Src.getOpcode() == ISD::FP_EXTEND &&
+ Src.getOperand(0).getValueType() == MVT::f16) {
+ return Src.getOperand(0);
+ }
+
+ if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
+ APFloat Val = CFP->getValueAPF();
+ bool LosesInfo = true;
+ Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
+ if (!LosesInfo)
+ return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
+ }
+
+ return SDValue();
+}
+
+SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
+ "combine only useful on gfx8");
+
+ SDValue TruncSrc = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::f16)
+ return SDValue();
+
+ if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
+ TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse() ||
+ !isNullConstant(N->getOperand(1)))
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
+ // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
+ // and expanding it with min/max saves 1 instruction vs. casting to f32 and
+ // casting back.
+
+ // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
+ // fmin(fmax(a, b), fmax(fmin(a, b), c))
+ SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
+ if (!A)
+ return SDValue();
+
+ SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
+ if (!B)
+ return SDValue();
+
+ SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
+ if (!C)
+ return SDValue();
+
+ // This changes signaling nan behavior. If an input is a signaling nan, it
+ // would have been quieted by the fpext originally. We don't care because
+ // these are unconstrained ops. If we needed to insert quieting canonicalizes
+ // we would be worse off than just doing the promotion.
+ SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
+ SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
+ SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
+ return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
+}
+
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
const SDNode *N1) const {
@@ -11861,6 +11929,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performExtractVectorEltCombine(N, DCI);
case ISD::INSERT_VECTOR_ELT:
return performInsertVectorEltCombine(N, DCI);
+ case ISD::FP_ROUND:
+ return performFPRoundCombine(N, DCI);
case ISD::LOAD: {
if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
return Widended;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index e82c97e51343..04d32d22ad57 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -198,6 +198,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
index 9ecd6e26803a..e3457421a490 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
@@ -7,7 +7,7 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s
-; Legal f16 med3. InstCombine ought to shrink the f32 op to f16.
+; Legal f16 med3. InstCombine ought to shrink the f32 op to f16 so the codegen doesn't really matter for this.
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
@@ -35,15 +35,23 @@ define half @fmed3_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 {
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: fmed3_f32_fpext_f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16:
; GFX9: ; %bb.0:
@@ -81,15 +89,23 @@ define half @fmed3_f32_fpext_f16_flags(half %arg0, half %arg1, half %arg2) #1 {
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: fmed3_f32_fpext_f16_flags:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_flags:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_flags:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_flags:
; GFX9: ; %bb.0:
@@ -181,14 +197,23 @@ define half @fmed3_f32_fpext_f16_k0(half %arg1, half %arg2) #1 {
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: fmed3_f32_fpext_f16_k0:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-NEXT: v_med3_f32 v0, 2.0, v0, v1
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_max_f16_e32 v2, 2.0, v0
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, 2.0, v0
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v2, v0
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k0:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_min_f16_e32 v2, 2.0, v0
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, 2.0, v0
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v2, v1
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_k0:
; GFX9: ; %bb.0:
@@ -223,14 +248,23 @@ define half @fmed3_f32_fpext_f16_k1(half %arg0, half %arg2) #1 {
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: fmed3_f32_fpext_f16_k1:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-NEXT: v_med3_f32 v0, v0, 2.0, v1
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k1:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_max_f16_e32 v2, 2.0, v0
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, 2.0, v0
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v2, v0
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k1:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_min_f16_e32 v2, 2.0, v0
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, 2.0, v0
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v2, v1
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_k1:
; GFX9: ; %bb.0:
@@ -265,14 +299,23 @@ define half @fmed3_f32_fpext_f16_k2(half %arg0, half %arg1) #1 {
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: fmed3_f32_fpext_f16_k2:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-NEXT: v_med3_f32 v0, v0, v1, 2.0
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k2:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, 2.0, v0
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v2, v0
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k2:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, 2.0, v2
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_k2:
; GFX9: ; %bb.0:
@@ -308,14 +351,25 @@ define half @fmed3_f32_fpext_f16_k0_k1(half %arg2) #1 {
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: fmed3_f32_fpext_f16_k0_k1:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x41800000
-; GFX8-NEXT: v_med3_f32 v0, 0, v1, v0
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k1:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v1, 0x4c00
+; GFX8-SDAG-NEXT: v_max_f16_e32 v2, 0, v1
+; GFX8-SDAG-NEXT: v_min_f16_e32 v1, 0, v1
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v1, v0
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v2, v0
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k0_k1:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0x4c00
+; GFX8-GISEL-NEXT: v_min_f16_e32 v2, 0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, 0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v2, v0
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v1, v0
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_k0_k1:
; GFX9: ; %bb.0:
@@ -348,13 +402,23 @@ define half @fmed3_f32_fpext_f16_k0_k2(half %arg1) #1 {
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: fmed3_f32_fpext_f16_k0_k2:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_med3_f32 v0, 0, v0, 2.0
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k2:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_max_f16_e32 v1, 0, v0
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, 0, v0
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, 2.0, v0
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v1, v0
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k0_k2:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_min_f16_e32 v1, 0, v0
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, 0, v0
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, 2.0, v1
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_k0_k2:
; GFX9: ; %bb.0:
@@ -394,15 +458,23 @@ define half @fmed3_f32_fpext_f16_fabs(half %arg0, half %arg1, half %arg2) #1 {
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: fmed3_f32_fpext_f16_fabs:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX8-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; GFX8-NEXT: v_cvt_f32_f16_e64 v2, |v2|
-; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fabs:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_max_f16_e64 v3, |v0|, |v1|
+; GFX8-SDAG-NEXT: v_min_f16_e64 v0, |v0|, |v1|
+; GFX8-SDAG-NEXT: v_max_f16_e64 v0, v0, |v2|
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_fabs:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_min_f16_e64 v3, |v0|, |v1|
+; GFX8-GISEL-NEXT: v_max_f16_e64 v0, |v0|, |v1|
+; GFX8-GISEL-NEXT: v_max_f16_e64 v1, v3, |v2|
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_fabs:
; GFX9: ; %bb.0:
@@ -492,15 +564,23 @@ define half @fmed3_f32_fpext_f16_fneg(half %arg0, half %arg1, half %arg2) #1 {
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: fmed3_f32_fpext_f16_fneg:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX8-NEXT: v_cvt_f32_f16_e64 v1, -v1
-; GFX8-NEXT: v_cvt_f32_f16_e64 v2, -v2
-; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fneg:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_max_f16_e64 v3, -v0, -v1
+; GFX8-SDAG-NEXT: v_min_f16_e64 v0, -v0, -v1
+; GFX8-SDAG-NEXT: v_max_f16_e64 v0, v0, -v2
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_fneg:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_min_f16_e64 v3, -v0, -v1
+; GFX8-GISEL-NEXT: v_max_f16_e64 v0, -v0, -v1
+; GFX8-GISEL-NEXT: v_max_f16_e64 v1, v3, -v2
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_fneg:
; GFX9: ; %bb.0:
@@ -596,15 +676,23 @@ define half @fmed3_f32_fpext_f16_fneg_fabs(half %arg0, half %arg1, half %arg2) #
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: fmed3_f32_fpext_f16_fneg_fabs:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; GFX8-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
-; GFX8-NEXT: v_cvt_f32_f16_e64 v2, -|v2|
-; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fneg_fabs:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_max_f16_e64 v3, -|v0|, -|v1|
+; GFX8-SDAG-NEXT: v_min_f16_e64 v0, -|v0|, -|v1|
+; GFX8-SDAG-NEXT: v_max_f16_e64 v0, v0, -|v2|
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_fneg_fabs:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_min_f16_e64 v3, -|v0|, -|v1|
+; GFX8-GISEL-NEXT: v_max_f16_e64 v0, -|v0|, -|v1|
+; GFX8-GISEL-NEXT: v_max_f16_e64 v1, v3, -|v2|
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_fneg_fabs:
; GFX9: ; %bb.0:
@@ -715,11 +803,10 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX8-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16:
@@ -773,17 +860,29 @@ define half @fmed3_f32_fpext_f16_multi_use_0(half %arg0, half %arg1, half %arg2,
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: fmed3_f32_fpext_f16_multi_use_0:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v2
-; GFX8-NEXT: flat_store_dword v[3:4], v5
-; GFX8-NEXT: v_med3_f32 v0, v5, v0, v1
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_0:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX8-SDAG-NEXT: flat_store_dword v[3:4], v5
+; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_0:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX8-GISEL-NEXT: flat_store_dword v[3:4], v5
+; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_multi_use_0:
; GFX9: ; %bb.0:
@@ -828,17 +927,29 @@ define half @fmed3_f32_fpext_f16_multi_use_1(half %arg0, half %arg1, half %arg2,
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: fmed3_f32_fpext_f16_multi_use_1:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX8-NEXT: flat_store_dword v[3:4], v1
-; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_1:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX8-SDAG-NEXT: flat_store_dword v[3:4], v5
+; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_1:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX8-GISEL-NEXT: flat_store_dword v[3:4], v5
+; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_multi_use_1:
; GFX9: ; %bb.0:
@@ -883,17 +994,29 @@ define half @fmed3_f32_fpext_f16_multi_use_2(half %arg0, half %arg1, half %arg2,
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: fmed3_f32_fpext_f16_multi_use_2:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: flat_store_dword v[3:4], v2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_2:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX8-SDAG-NEXT: flat_store_dword v[3:4], v5
+; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_2:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX8-GISEL-NEXT: flat_store_dword v[3:4], v5
+; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_multi_use_2:
; GFX9: ; %bb.0:
@@ -944,11 +1067,10 @@ define half @fmed3_f32_fpext_bf16(bfloat %arg0, bfloat %arg1, bfloat %arg2) #1 {
; GFX8-GISEL-LABEL: fmed3_f32_fpext_bf16:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX8-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: fmed3_f32_fpext_bf16:
@@ -1006,11 +1128,10 @@ define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1
; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_bf16_0:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX8-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: fmed3_f32_fpext_f16_bf16_0:
@@ -1070,11 +1191,10 @@ define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1
; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_bf16_1:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX8-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: fmed3_f32_fpext_f16_bf16_1:
@@ -1134,11 +1254,10 @@ define half @fmed3_f32_fpext_f16_bf16_2(half %arg0, half %arg1, bfloat %arg2) #1
; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_bf16_2:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX8-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2
+; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: fmed3_f32_fpext_f16_bf16_2: