[llvm] AMDGPU: Don't canonicalize fminnum/fmaxnum if targets support IEEE fminimum(maximum)_num (PR #127711)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 18 14:54:43 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-selectiondag
@llvm/pr-subscribers-backend-amdgpu
Author: Changpeng Fang (changpeng)
<details>
<summary>Changes</summary>
For targets that support IEEE fminimum_num/fmaximum_num, the corresponding *_min_num_fXY/*_max_num_fXY instructions themselves already did the canonicalization for the inputs. As a result, we do not need to explicitly canonicalize the inputs for fminnum/fmaxnum.
---
Patch is 287.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127711.diff
22 Files Affected:
- (modified) llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h (+2-1)
- (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+2-1)
- (modified) llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (+2-2)
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+3-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+2-1)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+4)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll (+23-38)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll (+23-38)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll (-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll (-8)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+84-111)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+84-111)
- (modified) llvm/test/CodeGen/AMDGPU/clamp.ll (+1-8)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+187-267)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+187-267)
- (modified) llvm/test/CodeGen/AMDGPU/fmin3.ll (+14-16)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+127-191)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+127-191)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+27-45)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+27-45)
- (modified) llvm/test/CodeGen/AMDGPU/minmax.ll (+26-76)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 4e18f5cc913a7..50eff989feda0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -425,7 +425,8 @@ class LegalizerHelper {
LegalizeResult lowerThreewayCompare(MachineInstr &MI);
LegalizeResult lowerMinMax(MachineInstr &MI);
LegalizeResult lowerFCopySign(MachineInstr &MI);
- LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI);
+ LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI,
+ bool ShouldCanonicalize = true);
LegalizeResult lowerFMad(MachineInstr &MI);
LegalizeResult lowerIntrinsicRound(MachineInstr &MI);
LegalizeResult lowerFFloor(MachineInstr &MI);
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a4c3d042fe3a4..7ec945d3a0108 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5314,7 +5314,8 @@ class TargetLowering : public TargetLoweringBase {
SelectionDAG &DAG) const;
/// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
- SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
+ SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG,
+ bool ShouldCanonicalize = true) const;
/// Expand fminimum/fmaximum into multiple comparison with selects.
SDValue expandFMINIMUM_FMAXIMUM(SDNode *N, SelectionDAG &DAG) const;
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index d4cb224c35d74..319c4ac28c167 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8137,14 +8137,14 @@ LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
}
LegalizerHelper::LegalizeResult
-LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
+LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI, bool ShouldCanonicalize) {
unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
auto [Dst, Src0, Src1] = MI.getFirst3Regs();
LLT Ty = MRI.getType(Dst);
- if (!MI.getFlag(MachineInstr::FmNoNans)) {
+ if (ShouldCanonicalize && !MI.getFlag(MachineInstr::FmNoNans)) {
// Insert canonicalizes if it's possible we need to quiet to get correct
// sNaN behavior.
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 7771958f5adc9..5804a42172a7b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8488,7 +8488,8 @@ TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node,
}
SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG,
+ bool ShouldCanonicalize) const {
if (SDValue Expanded = expandVectorNaryOpBySplitting(Node, DAG))
return Expanded;
@@ -8505,7 +8506,7 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
SDValue Quiet0 = Node->getOperand(0);
SDValue Quiet1 = Node->getOperand(1);
- if (!Node->getFlags().hasNoNaNs()) {
+ if (ShouldCanonicalize && !Node->getFlags().hasNoNaNs()) {
// Insert canonicalizes if it's possible we need to quiet to get correct
// sNaN behavior.
if (!DAG.isKnownNeverSNaN(Quiet0)) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 649deee346e90..4ce8ffb39599b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2710,7 +2710,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
if (IsIEEEOp)
return true;
- return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
+ return Helper.lowerFMinNumMaxNum(MI, !ST.hasIEEEMinNumMaxNum()) ==
+ LegalizerHelper::Legalized;
}
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 342b211199dca..418ca2b3b9e39 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1431,6 +1431,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// \returns true if the target has IEEE fminimum/fmaximum instructions
bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
+ // \returns true if the target has IEEE fminimum_num/fmaximum_num
+ // instructions
+ bool hasIEEEMinNumMaxNum() const { return getGeneration() >= GFX12; }
+
// \returns true if the target has IEEE fminimum3/fmaximum3 instructions
bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e09b310d107ac..ccce19348bb3d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6833,7 +6833,8 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
// mode functions, but this happens to be OK since it's only done in cases
// where there is known no sNaN.
if (IsIEEEMode)
- return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
+ return expandFMINNUM_FMAXNUM(Op.getNode(), DAG,
+ !Subtarget->hasIEEEMinNumMaxNum());
if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
VT == MVT::v16bf16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index d1a303b41deef..ed0a522f6c11d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -602,15 +602,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -757,21 +755,18 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1188,15 +1183,13 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1341,21 +1334,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1799,19 +1789,16 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, s16
-; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
-; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1971,23 +1958,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, s16
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen
+; GFX12-NEXT: buffer_load_b64 v[4:5], v6, s[0:3], null offen
; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index b8538cbf254fc..0d02c0d8cb464 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -602,15 +602,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -757,21 +755,18 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1188,15 +1183,13 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1341,21 +1334,18 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1799,19 +1789,16 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, s16
-; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
-; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[9:10], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/127711
More information about the llvm-commits
mailing list