[llvm] [MachineScheduler] Fix physreg dependencies of ExitSU (PR #123541)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 19 18:46:25 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Sergei Barannikov (s-barannikov)
<details>
<summary>Changes</summary>
Providing the correct operand index allows addPhysRegDataDeps to compute the correct latency.
---
Patch is 385.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123541.diff
53 Files Affected:
- (modified) llvm/lib/CodeGen/ScheduleDAGInstrs.cpp (+6-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+10-8)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+41-37)
- (modified) llvm/test/CodeGen/AMDGPU/call-args-inreg.ll (+63-63)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+654-693)
- (modified) llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/call-waitcnt.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/cc-update.ll (+56-56)
- (modified) llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+41-41)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+41-41)
- (modified) llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll (+63-63)
- (modified) llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/skip-if-dead.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/swdev373493.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved.ll (+6-6)
- (modified) llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll (+2-2)
- (modified) llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll (+2-2)
- (modified) llvm/test/CodeGen/ARM/vector-DAGCombine.ll (+1-1)
- (modified) llvm/test/CodeGen/PowerPC/p10-spill-creq.ll (+4-4)
- (modified) llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll (+3-3)
- (modified) llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll (+3-3)
- (modified) llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/mve-float32regloops.ll (+4-4)
- (modified) llvm/test/CodeGen/Thumb2/mve-gather-increment.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll (+9-9)
- (modified) llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll (+4-4)
- (modified) llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/pacbti-m-varargs-2.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/setjmp_longjmp.ll (+1-1)
``````````diff
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 8e3e06bf57153e..b2c3a0109b3a91 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -209,13 +209,17 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
ExitSU.setInstr(ExitMI);
// Add dependencies on the defs and uses of the instruction.
if (ExitMI) {
+ const MCInstrDesc &MIDesc = ExitMI->getDesc();
for (const MachineOperand &MO : ExitMI->all_uses()) {
+ unsigned OpIdx = MO.getOperandNo();
Register Reg = MO.getReg();
if (Reg.isPhysical()) {
+ bool IsRealUse = OpIdx < MIDesc.getNumOperands() ||
+ MIDesc.hasImplicitUseOfPhysReg(Reg);
for (MCRegUnit Unit : TRI->regunits(Reg))
- Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit));
+ Uses.insert(PhysRegSUOper(&ExitSU, IsRealUse ? OpIdx : -1, Unit));
} else if (Reg.isVirtual() && MO.readsReg()) {
- addVRegUseDeps(&ExitSU, MO.getOperandNo());
+ addVRegUseDeps(&ExitSU, OpIdx);
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index 604caf572b0fe8..830e932570b82b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -15,12 +15,12 @@ define ptr addrspace(1) @call_assert_align() {
; CHECK-NEXT: v_writelane_b32 v40, s16, 2
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, ext@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, ext@rel32@hi+12
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
@@ -45,11 +45,11 @@ define ptr addrspace(1) @tail_call_assert_align() {
; CHECK-LABEL: tail_call_assert_align:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, ext@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, ext@rel32@hi+12
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_setpc_b64 s[16:17]
entry:
%call = tail call align 4 ptr addrspace(1) @ext(ptr addrspace(1) null)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 974ce492daea8b..686e7db6facd4e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -44,8 +44,8 @@ define amdgpu_kernel void @kernel_caller_stack() {
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s0, s32, 12
; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
-; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s2, s32, 16
+; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
@@ -239,11 +239,11 @@ define void @func_caller_stack() {
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; MUBUF-NEXT: v_mov_b32_e32 v0, 12
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
@@ -274,15 +274,15 @@ define void @func_caller_stack() {
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s0, s32, 12
; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s0, s32, 16
; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
@@ -312,10 +312,10 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
; MUBUF-NEXT: s_waitcnt vmcnt(1)
@@ -394,8 +394,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
; FLATSCR-NEXT: v_add_u32_e32 v3, 8, v0
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
; FLATSCR-NEXT: s_add_u32 s0, s32, 8
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: s_add_u32 s2, s32, 56
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index 935200d5953072..91e16d91ddd15a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -191,9 +191,9 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
; GFX10-LABEL: divergent_i1_xor_used_outside_loop_larger_loop_body:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
index 287a8ab0e52f52..f1aaf1d59a5dc8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
@@ -380,8 +380,8 @@ define amdgpu_ps void @and_i1_scc(i32 inreg %a, i32 inreg %b, ptr addrspace(1) %
define amdgpu_ps void @divergent_phi_with_uniform_inputs(i32 %a, ptr addrspace(1) %out) {
; OLD_RBS-LABEL: divergent_phi_with_uniform_inputs:
; OLD_RBS: ; %bb.0: ; %A
-; OLD_RBS-NEXT: s_mov_b32 s0, 0
; OLD_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; OLD_RBS-NEXT: s_mov_b32 s0, 0
; OLD_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo
; OLD_RBS-NEXT: ; %bb.1: ; %B
; OLD_RBS-NEXT: s_mov_b32 s0, 1
@@ -393,8 +393,8 @@ define amdgpu_ps void @divergent_phi_with_uniform_inputs(i32 %a, ptr addrspace(1
;
; NEW_RBS-LABEL: divergent_phi_with_uniform_inputs:
; NEW_RBS: ; %bb.0: ; %A
-; NEW_RBS-NEXT: s_mov_b32 s0, 0
; NEW_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; NEW_RBS-NEXT: s_mov_b32 s0, 0
; NEW_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo
; NEW_RBS-NEXT: ; %bb.1: ; %B
; NEW_RBS-NEXT: s_mov_b32 s0, 1
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index e53653408feb40..8083e8e717f6be 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -25,10 +25,10 @@ define void @parent_func_missing_inputs() #0 {
; FIXEDABI-NEXT: v_writelane_b32 v40, s16, 2
; FIXEDABI-NEXT: s_addk_i32 s32, 0x400
; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0
-; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1
; FIXEDABI-NEXT: s_getpc_b64 s[16:17]
; FIXEDABI-NEXT: s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4
; FIXEDABI-NEXT: s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
+; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1
; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17]
; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
@@ -49,21 +49,21 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-SDAG: ; %bb.0:
; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9
; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9
+; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8
+; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5]
+; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
+; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2
; FIXEDABI-SDAG-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6
; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7
; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0
-; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
-; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5]
-; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
-; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
; FIXEDABI-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
; FIXEDABI-SDAG-NEXT: s_endpgm
;
@@ -71,21 +71,21 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-GISEL: ; %bb.0:
; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9
; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9
+; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2
-; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1
+; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5]
+; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
+; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6
; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7
; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0
-; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
-; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5]
-; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
-; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
; FIXEDABI-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
; FIXEDABI-GISEL-NEXT: s_endpgm
call void @requires_all_inputs()
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 9577230c6c52e2..d9ec61b709236f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1291,9 +1291,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s4, s6
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1417,9 +1417,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1545,9 +1546,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -3140,8 +3141,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6
-; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2
@@ -4846,9 +4847,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s4, s6
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -4972,9 +4973,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -5100,9 +5102,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -6734,8 +6736,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6
-; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 45b161d7959f4f..6d6fb6a7db4c6a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -919,9 +919,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1030,9 +1030,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s0, s2
; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -2630,8 +2630,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v8, s6, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v7, s3, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2
@@ -2812,8 +2812,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v7, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2
@@ -3301,8 +3301,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/123541
More information about the llvm-commits
mailing list