[llvm] [MachineLICM] Use `RegisterClassInfo::getRegPressureSetLimit` (PR #119826)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 12 23:01:48 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-llvm-transforms
Author: Pengcheng Wang (wangpc-pp)
<details>
<summary>Changes</summary>
`RegisterClassInfo::getRegPressureSetLimit` is a wrapper around
`TargetRegisterInfo::getRegPressureSetLimit` with additional logic that
adjusts the limit by excluding reserved registers.
It seems that we shouldn't use `TargetRegisterInfo::getRegPressureSetLimit`
directly, as the comment "This limit must be adjusted
dynamically for reserved registers" says.
Split out from https://github.com/llvm/llvm-project/pull/118787
---
Patch is 5.10 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119826.diff
50 Files Affected:
- (modified) llvm/lib/CodeGen/MachineLICM.cpp (+3-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll (+345-325)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll (+345-325)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll (+20-24)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+3113-3099)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+2235-2219)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+2235-2219)
- (modified) llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll (+45-50)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+1256-1268)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+2849-2831)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+2849-2831)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+1174-1190)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (+926-780)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+929-743)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll (+1067-967)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+1496-1510)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+2487-2500)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+2487-2500)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+1184-1200)
- (modified) llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll (+368-338)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll (+982-962)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+327-328)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+327-328)
- (modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (+36-34)
- (modified) llvm/test/CodeGen/AMDGPU/licm-regpressure.mir (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll (+75-75)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll (+75-75)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+522-512)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+688-692)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+688-692)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+558-562)
- (modified) llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll (+124-168)
- (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll (+2-5)
- (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll (+3-3)
- (modified) llvm/test/CodeGen/LoongArch/jr-without-ra.ll (+56-56)
- (modified) llvm/test/CodeGen/NVPTX/misched_func_call.ll (+3-4)
- (modified) llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll (+3-2)
- (modified) llvm/test/CodeGen/Thumb2/mve-blockplacement.ll (+63-65)
- (modified) llvm/test/CodeGen/Thumb2/mve-gather-increment.ll (+383-405)
- (modified) llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll (+70-70)
- (modified) llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll (+13-15)
``````````diff
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d1d5509dc482a2..798c3461094a8d 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -124,6 +124,7 @@ namespace {
const TargetRegisterInfo *TRI = nullptr;
const MachineFrameInfo *MFI = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ RegisterClassInfo RegClassInfo;
TargetSchedModel SchedModel;
bool PreRegAlloc = false;
bool HasProfileData = false;
@@ -392,6 +393,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
MFI = &MF.getFrameInfo();
MRI = &MF.getRegInfo();
SchedModel.init(&ST);
+ RegClassInfo.runOnMachineFunction(MF);
HasProfileData = MF.getFunction().hasProfileData();
@@ -408,7 +410,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
std::fill(RegPressure.begin(), RegPressure.end(), 0);
RegLimit.resize(NumRPS);
for (unsigned i = 0, e = NumRPS; i != e; ++i)
- RegLimit[i] = TRI->getRegPressureSetLimit(MF, i);
+ RegLimit[i] = RegClassInfo.getRegPressureSetLimit(i);
}
if (HoistConstLoads)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 23f24a9dc9982a..bd2bbb97983122 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -325,13 +325,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -370,13 +370,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -394,13 +394,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -469,21 +469,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -513,20 +513,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -536,20 +536,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -602,15 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -640,15 +640,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -686,15 +686,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -712,15 +712,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -758,21 +758,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -795,22 +795,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -840,21 +840,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v5, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -864,21 +864,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/119826
More information about the llvm-commits
mailing list