[llvm] [CodeGen] MachineLICM: Do not consider "loop liveins" as loop defs (PR #121769)
Gaëtan Bossu via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 6 07:20:41 PST 2025
https://github.com/gbossu updated https://github.com/llvm/llvm-project/pull/121769
>From 7d8ed8a9824dedbb365612ecf7caca2ee6172490 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= <gaetan.bossu at amd.com>
Date: Mon, 6 Jan 2025 10:32:55 +0000
Subject: [PATCH] [CodeGen] MachineLICM: Do not consider "loop liveins" as loop
defs
This allows hoisting instructions that use registers which are not
re-defined in the loop. Previously, MachineLICM basically could not hoist
any instruction using register inputs.
---
llvm/lib/CodeGen/MachineLICM.cpp | 20 +-
.../CodeGen/AArch64/machinelicm-post-ra.mir | 217 ++++++++++++++++++
llvm/test/CodeGen/AArch64/peephole-and-tst.ll | 2 +-
.../CodeGen/AMDGPU/indirect-addressing-si.ll | 16 +-
.../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 4 +-
.../CodeGen/AMDGPU/no-dup-inst-prefetch.ll | 34 ++-
.../AMDGPU/undefined-subreg-liverange.ll | 3 +-
llvm/test/CodeGen/RISCV/rvv/pr95865.ll | 2 +-
.../InferAddressSpaces/AMDGPU/flat_atomic.ll | 3 +-
9 files changed, 261 insertions(+), 40 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d1d5509dc482a2..0ae1f98468a3dc 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -550,6 +550,8 @@ void MachineLICMImpl::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
// If it's using a non-loop-invariant register, then it's obviously
// not safe to hoist.
+ // Note this isn't a final check, as we haven't gathered all the loop
+ // register definitions yet.
if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
HasNonInvariantUse = true;
break;
@@ -627,14 +629,6 @@ void MachineLICMImpl::HoistRegionPostRA(MachineLoop *CurLoop,
const MachineLoop *ML = MLI->getLoopFor(BB);
if (ML && ML->getHeader()->isEHPad()) continue;
- // Conservatively treat live-in's as an external def.
- // FIXME: That means a reload that're reused in successor block(s) will not
- // be LICM'ed.
- for (const auto &LI : BB->liveins()) {
- for (MCRegUnitIterator RUI(LI.PhysReg, TRI); RUI.isValid(); ++RUI)
- RUDefs.set(*RUI);
- }
-
// Funclet entry blocks will clobber all registers
if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, Mask);
@@ -644,6 +638,16 @@ void MachineLICMImpl::HoistRegionPostRA(MachineLoop *CurLoop,
ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop);
}
+ // Mark registers as clobbered if they are livein and also defined in the loop
+ for (const auto &LoopLI : CurLoop->getHeader()->liveins()) {
+ MCPhysReg LoopLiveInReg = LoopLI.PhysReg;
+ for (MCRegUnitIterator RUI(LoopLiveInReg, TRI); RUI.isValid(); ++RUI) {
+ if (RUDefs.test(*RUI)) {
+ RUClobbers.set(*RUI);
+ }
+ }
+ }
+
// Gather the registers read / clobbered by the terminator.
BitVector TermRUs(NumRegUnits);
MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
diff --git a/llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir b/llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir
new file mode 100644
index 00000000000000..49d569f73a83ea
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir
@@ -0,0 +1,217 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass machinelicm -o - %s | FileCheck %s
+
+# A couple of "unit" tests for the post-ra MachineLICM pass.
+
+# Positive test to show MachineLICM can hoist instructions with register operands.
+---
+name: hoist_invariant_add
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: hoist_invariant_add
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1, $w2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w0 = SUBWrr killed renamable $w0, $w2
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: RET_ReallyLR
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $w0, $w1
+ CBZW renamable $w0, %bb.3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+ liveins: $w0, $w1
+ B %bb.2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ liveins: $w0, $w1
+ renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ renamable $w0 = SUBWrr killed renamable $w0, $w2
+ CBZW renamable $w0, %bb.3
+ B %bb.2
+
+ bb.3:
+ RET_ReallyLR
+...
+
+# The first ADDWri does not have loop-invariant source operands,
+# it cannot be hoisted.
+---
+name: nohoist_variable_add_operands
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: nohoist_variable_add_operands
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w2 = ADDWri renamable $w1, 1, 0
+ ; CHECK-NEXT: renamable $w0 = SUBWrr killed renamable $w0, $w2
+ ; CHECK-NEXT: renamable $w1 = ADDWri killed renamable $w1, 1, 0
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: RET_ReallyLR
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $w0, $w1
+ CBZW renamable $w0, %bb.3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+ liveins: $w0, $w1
+ B %bb.2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ liveins: $w0, $w1
+ renamable $w2 = ADDWri renamable $w1, 1, 0
+ renamable $w0 = SUBWrr killed renamable $w0, $w2
+ renamable $w1 = ADDWri killed renamable $w1, 1, 0
+ CBZW renamable $w0, %bb.3
+ B %bb.2
+
+ bb.3:
+ RET_ReallyLR
+...
+
+# w2 is a loop-livein, but it is also redefined in the loop by ADDWri.
+# The latter cannot be hoisted.
+---
+name: nohoist_redef_livein_reg
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: nohoist_redef_livein_reg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w2 = ADDWri killed renamable $w1, 0, 0
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1, $w2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w0 = SUBWrr killed renamable $w0, $w2
+ ; CHECK-NEXT: renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: RET_ReallyLR
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $w0, $w1
+ CBZW renamable $w0, %bb.3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+ liveins: $w0, $w1
+ renamable $w2 = ADDWri killed renamable $w1, 0, 0
+ B %bb.2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ liveins: $w0, $w1, $w2
+ renamable $w0 = SUBWrr killed renamable $w0, $w2
+ renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ CBZW renamable $w0, %bb.3
+ B %bb.2
+
+ bb.3:
+ RET_ReallyLR
+...
+
+# $w2 = ADDWri $w1, 1 cannot be hoisted because w2 is later clobbered
+# through an assignment to x2 (an alias of w2).
+---
+name: nohoist_clobber_through_alias_reg
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: nohoist_clobber_through_alias_reg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ ; CHECK-NEXT: renamable $w0 = SUBWrr killed renamable $w0, $w2
+ ; CHECK-NEXT: $x2 = MOVi64imm 0
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: RET_ReallyLR
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $w0, $w1
+ CBZW renamable $w0, %bb.3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+ liveins: $w0, $w1
+ B %bb.2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ liveins: $w0, $w1
+ renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ renamable $w0 = SUBWrr killed renamable $w0, $w2
+ $x2 = MOVi64imm 0
+ CBZW renamable $w0, %bb.3
+ B %bb.2
+
+ bb.3:
+ RET_ReallyLR
+...
diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
index 17ad2983abe905..c9a89bd7b6e4fc 100644
--- a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -9,6 +9,7 @@ define i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) {
; CHECK-LABEL: test_func_i32_two_uses:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:ptr_wrapper
+; CHECK-NEXT: and w11, w2, w0
; CHECK-NEXT: ldr x8, [x8, :got_lo12:ptr_wrapper]
; CHECK-NEXT: ldr x9, [x8]
; CHECK-NEXT: mov w8, wzr
@@ -21,7 +22,6 @@ define i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) {
; CHECK-NEXT: .LBB0_3: // %do.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ands w10, w1, w0
-; CHECK-NEXT: and w11, w2, w0
; CHECK-NEXT: cinc w8, w8, ne
; CHECK-NEXT: cmp w10, w11
; CHECK-NEXT: b.eq .LBB0_1
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index e71c6cf71c8823..1c55b94f527134 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -9307,6 +9307,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 8
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; SI-MOVREL-NEXT: s_branch .LBB26_2
; SI-MOVREL-NEXT: .LBB26_1:
; SI-MOVREL-NEXT: ; implicit-def: $vgpr0
@@ -9314,14 +9316,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; SI-MOVREL-NEXT: .LBB26_2: ; %bb2
; SI-MOVREL-NEXT: ; =>This Loop Header: Depth=1
; SI-MOVREL-NEXT: ; Child Loop BB26_4 Depth 2
-; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
; SI-MOVREL-NEXT: s_cbranch_vccnz .LBB26_1
; SI-MOVREL-NEXT: ; %bb.3: ; %bb4
; SI-MOVREL-NEXT: ; in Loop: Header=BB26_2 Depth=1
; SI-MOVREL-NEXT: buffer_load_dword v16, off, s[0:3], 0 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; SI-MOVREL-NEXT: s_mov_b64 s[4:5], exec
; SI-MOVREL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-MOVREL-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
@@ -9343,6 +9343,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; VI-MOVREL: ; %bb.0: ; %bb
; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 8
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; VI-MOVREL-NEXT: s_branch .LBB26_2
; VI-MOVREL-NEXT: .LBB26_1:
; VI-MOVREL-NEXT: ; implicit-def: $vgpr0
@@ -9350,14 +9352,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; VI-MOVREL-NEXT: .LBB26_2: ; %bb2
; VI-MOVREL-NEXT: ; =>This Loop Header: Depth=1
; VI-MOVREL-NEXT: ; Child Loop BB26_4 Depth 2
-; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
; VI-MOVREL-NEXT: s_cbranch_vccnz .LBB26_1
; VI-MOVREL-NEXT: ; %bb.3: ; %bb4
; VI-MOVREL-NEXT: ; in Loop: Header=BB26_2 Depth=1
; VI-MOVREL-NEXT: flat_load_dword v16, v[0:1] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; VI-MOVREL-NEXT: s_mov_b64 s[2:3], exec
; VI-MOVREL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-MOVREL-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
@@ -9379,6 +9379,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; VI-IDXMODE: ; %bb.0: ; %bb
; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 8
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
; VI-IDXMODE-NEXT: s_branch .LBB26_2
; VI-IDXMODE-NEXT: .LBB26_1:
; VI-IDXMODE-NEXT: ; implicit-def: $vgpr0
@@ -9386,14 +9388,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; VI-IDXMODE-NEXT: .LBB26_2: ; %bb2
; VI-IDXMODE-NEXT: ; =>This Loop Header: Depth=1
; VI-IDXMODE-NEXT: ; Child Loop BB26_4 Depth 2
-; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
; VI-IDXMODE-NEXT: s_cbranch_vccnz .LBB26_1
; VI-IDXMODE-NEXT: ; %bb.3: ; %bb4
; VI-IDXMODE-NEXT: ; in Loop: Header=BB26_2 Depth=1
; VI-IDXMODE-NEXT: flat_load_dword v16, v[0:1] glc
; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
; VI-IDXMODE-NEXT: s_mov_b64 s[2:3], exec
; VI-IDXMODE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-IDXMODE-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
@@ -9416,6 +9416,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; GFX9-IDXMODE: ; %bb.0: ; %bb
; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
; GFX9-IDXMODE-NEXT: s_branch .LBB26_2
; GFX9-IDXMODE-NEXT: .LBB26_1:
; GFX9-IDXMODE-NEXT: ; implicit-def: $vgpr0
@@ -9423,14 +9425,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; GFX9-IDXMODE-NEXT: .LBB26_2: ; %bb2
; GFX9-IDXMODE-NEXT: ; =>This Loop Header: Depth=1
; GFX9-IDXMODE-NEXT: ; Child Loop BB26_4 Depth 2
-; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
; GFX9-IDXMODE-NEXT: s_cbranch_vccnz .LBB26_1
; GFX9-IDXMODE-NEXT: ; %bb.3: ; %bb4
; GFX9-IDXMODE-NEXT: ; in Loop: Header=BB26_2 Depth=1
; GFX9-IDXMODE-NEXT: global_load_dword v16, v[0:1], off glc
; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
; GFX9-IDXMODE-NEXT: s_mov_b64 s[2:3], exec
; GFX9-IDXMODE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-IDXMODE-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 5c5a769178dd94..61fcf7be8efa4b 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -171,10 +171,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_cbranch_vccz .LBB2_12
; GFX11-NEXT: ; %bb.9:
; GFX11-NEXT: s_xor_b32 s0, s8, -1
-; GFX11-NEXT: .LBB2_10: ; %bb17
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT: .LBB2_10: ; %bb17
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_cbranch_vccz .LBB2_10
; GFX11-NEXT: ; %bb.11: ; %Flow6
; GFX11-NEXT: s_mov_b32 s18, -1
diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
index d62f045674acec..5059081422d66c 100644
--- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
@@ -10,8 +10,15 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s1, 0
+; GFX10-NEXT: s_mov_b32 s8, s4
+; GFX10-NEXT: s_mov_b32 s9, s4
+; GFX10-NEXT: s_mov_b32 s10, s4
+; GFX10-NEXT: s_mov_b32 s11, s4
+; GFX10-NEXT: s_mov_b32 s12, s4
+; GFX10-NEXT: s_mov_b32 s13, s4
+; GFX10-NEXT: s_mov_b32 s14, s4
+; GFX10-NEXT: s_mov_b32 s15, s4
; GFX10-NEXT: ; implicit-def: $sgpr2
-; GFX10-NEXT: s_inst_prefetch 0x1
; GFX10-NEXT: s_branch .LBB0_2
; GFX10-NEXT: .p2align 6
; GFX10-NEXT: .LBB0_1: ; %Flow
@@ -32,14 +39,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: s_mov_b32 s5, s4
; GFX10-NEXT: s_mov_b32 s6, s4
; GFX10-NEXT: s_mov_b32 s7, s4
-; GFX10-NEXT: s_mov_b32 s8, s4
-; GFX10-NEXT: s_mov_b32 s9, s4
-; GFX10-NEXT: s_mov_b32 s10, s4
-; GFX10-NEXT: s_mov_b32 s11, s4
-; GFX10-NEXT: s_mov_b32 s12, s4
-; GFX10-NEXT: s_mov_b32 s13, s4
-; GFX10-NEXT: s_mov_b32 s14, s4
-; GFX10-NEXT: s_mov_b32 s15, s4
; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
; GFX10-NEXT: image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -49,7 +48,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: s_or_b32 s2, s2, s0
; GFX10-NEXT: s_branch .LBB0_1
; GFX10-NEXT: .LBB0_4: ; %loop0_merge
-; GFX10-NEXT: s_inst_prefetch 0x2
; GFX10-NEXT: s_endpgm
;
; GFX12-LABEL: _amdgpu_cs_main:
@@ -59,6 +57,14 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: s_mov_b32 s8, s4
+; GFX12-NEXT: s_mov_b32 s9, s4
+; GFX12-NEXT: s_mov_b32 s10, s4
+; GFX12-NEXT: s_mov_b32 s11, s4
+; GFX12-NEXT: s_mov_b32 s12, s4
+; GFX12-NEXT: s_mov_b32 s13, s4
+; GFX12-NEXT: s_mov_b32 s14, s4
+; GFX12-NEXT: s_mov_b32 s15, s4
; GFX12-NEXT: ; implicit-def: $sgpr2
; GFX12-NEXT: s_branch .LBB0_2
; GFX12-NEXT: .LBB0_1: ; %Flow
@@ -80,14 +86,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX12-NEXT: s_mov_b32 s5, s4
; GFX12-NEXT: s_mov_b32 s6, s4
; GFX12-NEXT: s_mov_b32 s7, s4
-; GFX12-NEXT: s_mov_b32 s8, s4
-; GFX12-NEXT: s_mov_b32 s9, s4
-; GFX12-NEXT: s_mov_b32 s10, s4
-; GFX12-NEXT: s_mov_b32 s11, s4
-; GFX12-NEXT: s_mov_b32 s12, s4
-; GFX12-NEXT: s_mov_b32 s13, s4
-; GFX12-NEXT: s_mov_b32 s14, s4
-; GFX12-NEXT: s_mov_b32 s15, s4
; GFX12-NEXT: s_and_not1_b32 s2, s2, exec_lo
; GFX12-NEXT: image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
; GFX12-NEXT: s_wait_samplecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
index 0acee5bd5ac19d..a3d1214100cd5b 100644
--- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -56,9 +56,10 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 {
; CHECK-NEXT: s_waitcnt expcnt(1)
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT: .LBB1_1: ; %bb9
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_mov_b64 vcc, vcc
; CHECK-NEXT: s_cbranch_vccnz .LBB1_1
; CHECK-NEXT: ; %bb.2: ; %bb11
; CHECK-NEXT: s_mov_b32 s3, 0xf000
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
index 06a357eeaeb61e..2f8efc511632b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
@@ -46,6 +46,7 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: andi t3, a4, 1
; CHECK-NEXT: li t2, 4
+; CHECK-NEXT: andi s10, a0, 1
; CHECK-NEXT: .LBB0_1: # %for.cond1.preheader.i
; CHECK-NEXT: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB0_2 Depth 2
@@ -105,7 +106,6 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
; CHECK-NEXT: addi s8, s8, 4
; CHECK-NEXT: addi ra, ra, 4
; CHECK-NEXT: addi a3, a3, 4
-; CHECK-NEXT: andi s10, a0, 1
; CHECK-NEXT: addi s11, s11, 4
; CHECK-NEXT: beqz s10, .LBB0_4
; CHECK-NEXT: # %bb.7: # %for.cond.cleanup11.i
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
index 6633cec659d8e5..90778606a8e5b3 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
@@ -129,9 +129,10 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT: .LBB3_1: ; %bb0
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_mov_b64 vcc, vcc
; CHECK-NEXT: s_cbranch_vccnz .LBB3_1
; CHECK-NEXT: ; %bb.2: ; %bb1
; CHECK-NEXT: s_mov_b64 s[0:1], exec
More information about the llvm-commits
mailing list