[llvm] expandFMINIMUMNUM_FMAXIMUMNUM: Improve compare between zeros (PR #140193)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 15 22:40:00 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-selectiondag
Author: YunQiang Su (wzssyqa)
<details>
<summary>Changes</summary>
1. On GPR32 platforms, expandIS_FPCLASS may fail because ISD::BITCAST
from double to int64 may fail. Let's FP_ROUND the double to float first.
Since we only use the result when MinMax is zero, the flushing won't
break anything.
2. Only one IS_FPCLASS is needed. MinMax will always be RHS if the operands
compare equal, so we can select between LHS and MinMax.
This is safe even if FP_ROUND flushes a small LHS to zero: if LHS is not
zero, then MinMax won't be zero, so we will always use MinMax.
---
Patch is 2.87 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140193.diff
7 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+21-11)
- (modified) llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll (+193-493)
- (modified) llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll (+217-518)
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll (+7105-12288)
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll (+7214-12382)
- (modified) llvm/test/CodeGen/Mips/fp-maximumnum-minimumnum.ll (+880)
- (modified) llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll (+116-162)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index f17d6a2787889..5166bc0ecd3b8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8681,13 +8681,9 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
RHS = DAG.getSelectCC(DL, RHS, RHS, LHS, RHS, ISD::SETUO);
}
+ // Please always prefer RHS if equal.
SDValue MinMax =
DAG.getSelectCC(DL, LHS, RHS, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);
- // If MinMax is NaN, let's quiet it.
- if (!Flags.hasNoNaNs() && !DAG.isKnownNeverNaN(LHS) &&
- !DAG.isKnownNeverNaN(RHS)) {
- MinMax = DAG.getNode(ISD::FCANONICALIZE, DL, VT, MinMax, Flags);
- }
// Fixup signed zero behavior.
if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros() ||
@@ -8698,13 +8694,27 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
DAG.getTargetConstant(IsMax ? fcPosZero : fcNegZero, DL, MVT::i32);
SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
DAG.getConstantFP(0.0, DL, VT), ISD::SETEQ);
- SDValue LCmp = DAG.getSelect(
- DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHS, TestZero), LHS,
+ unsigned BitSize = VT.getScalarSizeInBits();
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), BitSize);
+ EVT FloatVT = EVT::getFloatingPointVT(32);
+ if (VT.isVector()) {
+ IntVT =
+ EVT::getVectorVT(*DAG.getContext(), IntVT, VT.getVectorElementCount());
+ FloatVT = EVT::getVectorVT(*DAG.getContext(), FloatVT,
+ VT.getVectorElementCount());
+ }
+ SDValue LHSTrunc = LHS;
+ if (!isOperationLegal(ISD::BITCAST, IntVT) &&
+ !isOperationLegal(ISD::IS_FPCLASS, VT)) {
+ LHSTrunc = DAG.getNode(ISD::FP_ROUND, DL, FloatVT, LHS,
+ DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
+ }
+ // It's OK to select from LHS and MinMax, with only one ISD::IS_FPCLASS, as
+ // we preferred RHS when generate MinMax, if the operands are equal.
+ SDValue RetZero = DAG.getSelect(
+ DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHSTrunc, TestZero), LHS,
MinMax, Flags);
- SDValue RCmp = DAG.getSelect(
- DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, RHS, TestZero), RHS, LCmp,
- Flags);
- return DAG.getSelect(DL, VT, IsZero, RCmp, MinMax, Flags);
+ return DAG.getSelect(DL, VT, IsZero, RetZero, MinMax, Flags);
}
/// Returns a true value if if this FPClassTest can be performed with an ordered
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
index d458bb2492f23..02ea2cc2a1919 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
@@ -1696,23 +1696,13 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -1722,22 +1712,14 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
-; GFX8-NEXT: v_cndmask_b32_sdwa v1, v2, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-SDAG-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1753,22 +1735,13 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, v3, v4
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX900-SDAG-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX900-SDAG-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x7fff
-; GFX900-SDAG-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX900-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX900-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX900-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX900-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX900-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX900-SDAG-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX900-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -1779,21 +1752,13 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX900-SDAG-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX900-SDAG-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX900-SDAG-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX900-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX900-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX900-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX900-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX900-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-SDAG-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-SDAG-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-SDAG-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1812,9 +1777,6 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX950-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, v3, v4
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX950-SDAG-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX950-SDAG-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
; GFX950-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
@@ -1838,9 +1800,6 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX950-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX950-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX950-SDAG-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
; GFX950-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
@@ -1866,22 +1825,14 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
@@ -1891,21 +1842,13 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1921,61 +1864,38 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX11-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1997,72 +1917,47 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v3
-; GFX12-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX12-NEXT: v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX12-NEXT: ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/140193
More information about the llvm-commits
mailing list