[llvm] [CodeGen] MachineLICM: Do not consider "loop liveins" as loop defs (PR #121769)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 6 06:10:11 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Gaëtan Bossu (gbossu)
Changes:
This allows hoisting instructions that use registers which are not re-defined in the loop. Previously, the post-RA MachineLICM basically could not hoist any instruction with register inputs. For more context, feel free to refer to the [discourse post I created some time ago](https://discourse.llvm.org/t/extending-post-regalloc-machinelicm/82725).
For what it's worth, I added a small .mir "unit test" using AArch64 as the target. AFAIK, there aren't many existing tests for the post-RA MachineLICM that I can easily extend.
There are test updates for RISC-V, AArch64, and AMDGPU. They seem correct, but I'm no expert :)
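To make the behaviour change concrete, here is a minimal, self-contained C++ sketch of the bookkeeping this patch touches. It is not the LLVM API: the `Defs`/`Clobbers` names, the bitset size, and the unit numbers are illustrative. The old code conservatively treated every loop live-in as an external def; with this patch, a live-in only blocks hoisting when it is also defined inside the loop.

```c++
// Toy model of the new live-in handling (assumed names, not LLVM's API).
#include <bitset>
#include <cstddef>
#include <cstdio>
#include <vector>

constexpr std::size_t NumUnits = 64; // pretend register-unit universe

int main() {
  std::bitset<NumUnits> Defs;     // units written by instructions inside the loop
  std::bitset<NumUnits> Clobbers; // units that must not be treated as invariant
  std::vector<std::size_t> HeaderLiveIns = {1, 2}; // e.g. $w1 and $w2

  Defs.set(2); // the loop redefines unit 2 (like $w2 in nohoist_redef_livein_reg)

  // Rule modelled on the patch: a live-in becomes a clobber only if it is
  // also defined in the loop. Unit 1 stays invariant, so an instruction
  // reading it can still be hoisted; unit 2 cannot.
  for (std::size_t U : HeaderLiveIns)
    if (Defs.test(U))
      Clobbers.set(U);

  for (std::size_t U : HeaderLiveIns)
    std::printf("unit %zu usable as a hoistable operand: %s\n", U,
                Clobbers.test(U) ? "no" : "yes");
  return 0;
}
```

The hoist_invariant_add and nohoist_redef_livein_reg cases in the new .mir test below exercise exactly these two situations.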
---
Patch is 21.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121769.diff
9 Files Affected:
- (modified) llvm/lib/CodeGen/MachineLICM.cpp (+12-8)
- (added) llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir (+217)
- (modified) llvm/test/CodeGen/AArch64/peephole-and-tst.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll (+16-18)
- (modified) llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll (+2-1)
- (modified) llvm/test/CodeGen/RISCV/rvv/pr95865.ll (+1-1)
- (modified) llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll (+2-1)
``````````diff
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d1d5509dc482a2..0ae1f98468a3dc 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -550,6 +550,8 @@ void MachineLICMImpl::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
// If it's using a non-loop-invariant register, then it's obviously
// not safe to hoist.
+ // Note this isn't a final check, as we haven't gathered all the loop
+ // register definitions yet.
if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
HasNonInvariantUse = true;
break;
@@ -627,14 +629,6 @@ void MachineLICMImpl::HoistRegionPostRA(MachineLoop *CurLoop,
const MachineLoop *ML = MLI->getLoopFor(BB);
if (ML && ML->getHeader()->isEHPad()) continue;
- // Conservatively treat live-in's as an external def.
- // FIXME: That means a reload that're reused in successor block(s) will not
- // be LICM'ed.
- for (const auto &LI : BB->liveins()) {
- for (MCRegUnitIterator RUI(LI.PhysReg, TRI); RUI.isValid(); ++RUI)
- RUDefs.set(*RUI);
- }
-
// Funclet entry blocks will clobber all registers
if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, Mask);
@@ -644,6 +638,16 @@ void MachineLICMImpl::HoistRegionPostRA(MachineLoop *CurLoop,
ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop);
}
+ // Mark registers as clobbered if they are livein and also defined in the loop
+ for (const auto &LoopLI : CurLoop->getHeader()->liveins()) {
+ MCPhysReg LoopLiveInReg = LoopLI.PhysReg;
+ for (MCRegUnitIterator RUI(LoopLiveInReg, TRI); RUI.isValid(); ++RUI) {
+ if (RUDefs.test(*RUI)) {
+ RUClobbers.set(*RUI);
+ }
+ }
+ }
+
// Gather the registers read / clobbered by the terminator.
BitVector TermRUs(NumRegUnits);
MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
diff --git a/llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir b/llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir
new file mode 100644
index 00000000000000..49d569f73a83ea
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir
@@ -0,0 +1,217 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass machinelicm -o - %s | FileCheck %s
+
+# A couple of "unit" tests for the post-ra MachineLICM pass.
+
+# Positive test to show MachineLICM can hoist instructions with register operands.
+---
+name: hoist_invariant_add
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: hoist_invariant_add
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1, $w2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w0 = SUBWrr killed renamable $w0, $w2
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: RET_ReallyLR
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $w0, $w1
+ CBZW renamable $w0, %bb.3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+ liveins: $w0, $w1
+ B %bb.2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ liveins: $w0, $w1
+ renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ renamable $w0 = SUBWrr killed renamable $w0, $w2
+ CBZW renamable $w0, %bb.3
+ B %bb.2
+
+ bb.3:
+ RET_ReallyLR
+...
+
+# The first ADDWri does not have loop-invariant source operands,
+# it cannot be hoisted.
+---
+name: nohoist_variable_add_operands
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: nohoist_variable_add_operands
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w2 = ADDWri renamable $w1, 1, 0
+ ; CHECK-NEXT: renamable $w0 = SUBWrr killed renamable $w0, $w2
+ ; CHECK-NEXT: renamable $w1 = ADDWri killed renamable $w1, 1, 0
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: RET_ReallyLR
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $w0, $w1
+ CBZW renamable $w0, %bb.3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+ liveins: $w0, $w1
+ B %bb.2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ liveins: $w0, $w1
+ renamable $w2 = ADDWri renamable $w1, 1, 0
+ renamable $w0 = SUBWrr killed renamable $w0, $w2
+ renamable $w1 = ADDWri killed renamable $w1, 1, 0
+ CBZW renamable $w0, %bb.3
+ B %bb.2
+
+ bb.3:
+ RET_ReallyLR
+...
+
+# w2 is a loop-livein, but it is also redefined in the loop by ADDWri.
+# The latter cannot be hoisted.
+---
+name: nohoist_redef_livein_reg
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: nohoist_redef_livein_reg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w2 = ADDWri killed renamable $w1, 0, 0
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1, $w2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w0 = SUBWrr killed renamable $w0, $w2
+ ; CHECK-NEXT: renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: RET_ReallyLR
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $w0, $w1
+ CBZW renamable $w0, %bb.3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+ liveins: $w0, $w1
+ renamable $w2 = ADDWri killed renamable $w1, 0, 0
+ B %bb.2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ liveins: $w0, $w1, $w2
+ renamable $w0 = SUBWrr killed renamable $w0, $w2
+ renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ CBZW renamable $w0, %bb.3
+ B %bb.2
+
+ bb.3:
+ RET_ReallyLR
+...
+
+# $w2 = ADDWri $w1, 1 cannot be hoisted because w2 is later clobbered
+# through an assignment to x2 (an alias of w2).
+---
+name: nohoist_clobber_through_alias_reg
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: nohoist_clobber_through_alias_reg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ ; CHECK-NEXT: renamable $w0 = SUBWrr killed renamable $w0, $w2
+ ; CHECK-NEXT: $x2 = MOVi64imm 0
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: RET_ReallyLR
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $w0, $w1
+ CBZW renamable $w0, %bb.3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+ liveins: $w0, $w1
+ B %bb.2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ liveins: $w0, $w1
+ renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ renamable $w0 = SUBWrr killed renamable $w0, $w2
+ $x2 = MOVi64imm 0
+ CBZW renamable $w0, %bb.3
+ B %bb.2
+
+ bb.3:
+ RET_ReallyLR
+...
diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
index 17ad2983abe905..c9a89bd7b6e4fc 100644
--- a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -9,6 +9,7 @@ define i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) {
; CHECK-LABEL: test_func_i32_two_uses:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:ptr_wrapper
+; CHECK-NEXT: and w11, w2, w0
; CHECK-NEXT: ldr x8, [x8, :got_lo12:ptr_wrapper]
; CHECK-NEXT: ldr x9, [x8]
; CHECK-NEXT: mov w8, wzr
@@ -21,7 +22,6 @@ define i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) {
; CHECK-NEXT: .LBB0_3: // %do.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ands w10, w1, w0
-; CHECK-NEXT: and w11, w2, w0
; CHECK-NEXT: cinc w8, w8, ne
; CHECK-NEXT: cmp w10, w11
; CHECK-NEXT: b.eq .LBB0_1
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index e71c6cf71c8823..1c55b94f527134 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -9307,6 +9307,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 8
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; SI-MOVREL-NEXT: s_branch .LBB26_2
; SI-MOVREL-NEXT: .LBB26_1:
; SI-MOVREL-NEXT: ; implicit-def: $vgpr0
@@ -9314,14 +9316,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; SI-MOVREL-NEXT: .LBB26_2: ; %bb2
; SI-MOVREL-NEXT: ; =>This Loop Header: Depth=1
; SI-MOVREL-NEXT: ; Child Loop BB26_4 Depth 2
-; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
; SI-MOVREL-NEXT: s_cbranch_vccnz .LBB26_1
; SI-MOVREL-NEXT: ; %bb.3: ; %bb4
; SI-MOVREL-NEXT: ; in Loop: Header=BB26_2 Depth=1
; SI-MOVREL-NEXT: buffer_load_dword v16, off, s[0:3], 0 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; SI-MOVREL-NEXT: s_mov_b64 s[4:5], exec
; SI-MOVREL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-MOVREL-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
@@ -9343,6 +9343,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; VI-MOVREL: ; %bb.0: ; %bb
; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 8
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; VI-MOVREL-NEXT: s_branch .LBB26_2
; VI-MOVREL-NEXT: .LBB26_1:
; VI-MOVREL-NEXT: ; implicit-def: $vgpr0
@@ -9350,14 +9352,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; VI-MOVREL-NEXT: .LBB26_2: ; %bb2
; VI-MOVREL-NEXT: ; =>This Loop Header: Depth=1
; VI-MOVREL-NEXT: ; Child Loop BB26_4 Depth 2
-; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
; VI-MOVREL-NEXT: s_cbranch_vccnz .LBB26_1
; VI-MOVREL-NEXT: ; %bb.3: ; %bb4
; VI-MOVREL-NEXT: ; in Loop: Header=BB26_2 Depth=1
; VI-MOVREL-NEXT: flat_load_dword v16, v[0:1] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; VI-MOVREL-NEXT: s_mov_b64 s[2:3], exec
; VI-MOVREL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-MOVREL-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
@@ -9379,6 +9379,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; VI-IDXMODE: ; %bb.0: ; %bb
; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 8
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
; VI-IDXMODE-NEXT: s_branch .LBB26_2
; VI-IDXMODE-NEXT: .LBB26_1:
; VI-IDXMODE-NEXT: ; implicit-def: $vgpr0
@@ -9386,14 +9388,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; VI-IDXMODE-NEXT: .LBB26_2: ; %bb2
; VI-IDXMODE-NEXT: ; =>This Loop Header: Depth=1
; VI-IDXMODE-NEXT: ; Child Loop BB26_4 Depth 2
-; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
; VI-IDXMODE-NEXT: s_cbranch_vccnz .LBB26_1
; VI-IDXMODE-NEXT: ; %bb.3: ; %bb4
; VI-IDXMODE-NEXT: ; in Loop: Header=BB26_2 Depth=1
; VI-IDXMODE-NEXT: flat_load_dword v16, v[0:1] glc
; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
; VI-IDXMODE-NEXT: s_mov_b64 s[2:3], exec
; VI-IDXMODE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-IDXMODE-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
@@ -9416,6 +9416,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; GFX9-IDXMODE: ; %bb.0: ; %bb
; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
; GFX9-IDXMODE-NEXT: s_branch .LBB26_2
; GFX9-IDXMODE-NEXT: .LBB26_1:
; GFX9-IDXMODE-NEXT: ; implicit-def: $vgpr0
@@ -9423,14 +9425,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; GFX9-IDXMODE-NEXT: .LBB26_2: ; %bb2
; GFX9-IDXMODE-NEXT: ; =>This Loop Header: Depth=1
; GFX9-IDXMODE-NEXT: ; Child Loop BB26_4 Depth 2
-; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
; GFX9-IDXMODE-NEXT: s_cbranch_vccnz .LBB26_1
; GFX9-IDXMODE-NEXT: ; %bb.3: ; %bb4
; GFX9-IDXMODE-NEXT: ; in Loop: Header=BB26_2 Depth=1
; GFX9-IDXMODE-NEXT: global_load_dword v16, v[0:1], off glc
; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
; GFX9-IDXMODE-NEXT: s_mov_b64 s[2:3], exec
; GFX9-IDXMODE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-IDXMODE-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 5c5a769178dd94..61fcf7be8efa4b 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -171,10 +171,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_cbranch_vccz .LBB2_12
; GFX11-NEXT: ; %bb.9:
; GFX11-NEXT: s_xor_b32 s0, s8, -1
-; GFX11-NEXT: .LBB2_10: ; %bb17
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT: .LBB2_10: ; %bb17
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_cbranch_vccz .LBB2_10
; GFX11-NEXT: ; %bb.11: ; %Flow6
; GFX11-NEXT: s_mov_b32 s18, -1
diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
index d62f045674acec..5059081422d66c 100644
--- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
@@ -10,8 +10,15 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s1, 0
+; GFX10-NEXT: s_mov_b32 s8, s4
+; GFX10-NEXT: s_mov_b32 s9, s4
+; GFX10-NEXT: s_mov_b32 s10, s4
+; GFX10-NEXT: s_mov_b32 s11, s4
+; GFX10-NEXT: s_mov_b32 s12, s4
+; GFX10-NEXT: s_mov_b32 s13, s4
+; GFX10-NEXT: s_mov_b32 s14, s4
+; GFX10-NEXT: s_mov_b32 s15, s4
; GFX10-NEXT: ; implicit-def: $sgpr2
-; GFX10-NEXT: s_inst_prefetch 0x1
; GFX10-NEXT: s_branch .LBB0_2
; GFX10-NEXT: .p2align 6
; GFX10-NEXT: .LBB0_1: ; %Flow
@@ -32,14 +39,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: s_mov_b32 s5, s4
; GFX10-NEXT: s_mov_b32 s6, s4
; GFX10-NEXT: s_mov_b32 s7, s4
-; GFX10-NEXT: s_mov_b32 s8, s4
-; GFX10-NEXT: s_mov_b32 s9, s4
-; GFX10-NEXT: s_mov_b32 s10, s4
-; GFX10-NEXT: s_mov_b32 s11, s4
-; GFX10-NEXT: s_mov_b32 s12, s4
-; GFX10-NEXT: s_mov_b32 s13, s4
-; GFX10-NEXT: s_mov_b32 s14, s4
-; GFX10-NEXT: s_mov_b32 s15, s4
; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
; GFX10-NEXT: image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -49,7 +48,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: s_or_b32 s2, s2, s0
; GFX10-NEXT: s_branch .LBB0_1
; GFX10-NEXT: .LBB0_4: ; %loop0_merge
-; GFX10-NEXT: s_inst_prefetch 0x2
; GFX10-NEXT: s_endpgm
;
; GFX12-LABEL: _amdgpu_cs_main:
@@ -59,6 +57,14 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: s_mov_b32 s8, s4
+; GFX12-NEXT: s_mov_b32 s9, s4
+; GFX12-NEXT: s_mov_b32 s10, s4
+; GFX12-NEXT: s_mov_b32 s11, s4
+; GFX12-NEXT: s_mov_b32 s12, s4
+; GFX12-NEXT: s_mov_b32 s13, s4
+; GFX12-NEXT: s_mov_b32 s14, s4
+; GFX12-NEXT: s_mov_b32 s15, s4
; GFX12-NEXT: ; implicit-def: $sgpr2
; GFX12-NEXT: s_branch .LBB0_2
; GFX12-NEXT: .LBB0_1: ; %Flow
@@ -80,14 +86,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX12-NEXT: s_mov_b32 s5, s4
; GFX12-NEXT: s_mov_b32 s6, s4
; GFX12-NEXT: s_mov_b32 s7, s4
-; GFX12-NEXT: s_mov_b32 s8, s4
-; GFX12-NEXT: s_mov_b32 s9, s4
-; GFX12-NEXT: s_mov_b32 s10, s4
-; GFX12-NEXT: s_mov_b32 s11, s4
-; GFX12-NEXT: s_mov_b32 s12, s4
-; GFX12-NEXT: s_mov_b32 s13, s4
-; GFX12-NEXT: s_mov_b32 s14, s4
-; GFX12-NEXT: s_mov_b32 s15, s4
; GFX12-NEXT: s_and_not1_b32 s2, s2, exec_lo
; GFX12-NEXT: image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
; GFX12-NEXT: s_wait_samplecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
index 0acee5bd5ac19d..a3d1214100cd5b 100644
--- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -56,9 +56,10 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 {
; CHECK-NEXT: s_waitcnt expcnt(1)
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT: .LBB1_1: ; %bb9
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_mov_b64 vcc, vcc
; CHECK-NEXT: s_cbranch_vccnz .LBB1_1
; CHECK-NEXT: ; %bb.2: ; %bb11
; CHECK-NEXT: s_mov_b32 s3, 0xf000
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
index 06a357eeaeb61e..2f...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/121769
More information about the llvm-commits mailing list