[llvm] [CodeGen] MachineLICM: Do not consider "loop liveins" as loop defs (PR #121769)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 6 06:10:11 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Gaëtan Bossu (gbossu)
Changes:
This allows hoisting instructions that use registers which are not re-defined in the loop. Previously, the post-RA MachineLICM basically could not hoist any instruction with register inputs. For more context, feel free to refer to the [discourse post I created some time ago](https://discourse.llvm.org/t/extending-post-regalloc-machinelicm/82725).
For what it's worth, I added a small .mir "unit test" using AArch64 as the target. AFAIK, there aren't many existing tests for the post-RA MachineLICM that I can easily extend.
There are test updates for RISC-V, AArch64, and AMDGPU. They seem correct, but I'm no expert :)
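To make the behaviour change concrete, here is a minimal, self-contained C++ sketch of the bookkeeping this patch touches. It is not the LLVM API: the `Defs`/`Clobbers` names, the bitset size, and the unit numbers are illustrative. The old code conservatively treated every loop live-in as an external def; with this patch, a live-in only blocks hoisting when it is also defined inside the loop.

```c++
// Toy model of the new live-in handling (assumed names, not LLVM's API).
#include <bitset>
#include <cstddef>
#include <cstdio>
#include <vector>

constexpr std::size_t NumUnits = 64; // pretend register-unit universe

int main() {
  std::bitset<NumUnits> Defs;     // units written by instructions inside the loop
  std::bitset<NumUnits> Clobbers; // units that must not be treated as invariant
  std::vector<std::size_t> HeaderLiveIns = {1, 2}; // e.g. $w1 and $w2

  Defs.set(2); // the loop redefines unit 2 (like $w2 in nohoist_redef_livein_reg)

  // Rule modelled on the patch: a live-in becomes a clobber only if it is
  // also defined in the loop. Unit 1 stays invariant, so an instruction
  // reading it can still be hoisted; unit 2 cannot.
  for (std::size_t U : HeaderLiveIns)
    if (Defs.test(U))
      Clobbers.set(U);

  for (std::size_t U : HeaderLiveIns)
    std::printf("unit %zu usable as a hoistable operand: %s\n", U,
                Clobbers.test(U) ? "no" : "yes");
  return 0;
}
```

The hoist_invariant_add and nohoist_redef_livein_reg cases in the new .mir test below exercise exactly these two situations.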
---
Patch is 21.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121769.diff
9 Files Affected:
- (modified) llvm/lib/CodeGen/MachineLICM.cpp (+12-8)
- (added) llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir (+217)
- (modified) llvm/test/CodeGen/AArch64/peephole-and-tst.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll (+16-18)
- (modified) llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll (+2-1)
- (modified) llvm/test/CodeGen/RISCV/rvv/pr95865.ll (+1-1)
- (modified) llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll (+2-1)
``````````diff
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d1d5509dc482a2..0ae1f98468a3dc 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -550,6 +550,8 @@ void MachineLICMImpl::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
// If it's using a non-loop-invariant register, then it's obviously
// not safe to hoist.
+ // Note this isn't a final check, as we haven't gathered all the loop
+ // register definitions yet.
if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
HasNonInvariantUse = true;
break;
@@ -627,14 +629,6 @@ void MachineLICMImpl::HoistRegionPostRA(MachineLoop *CurLoop,
const MachineLoop *ML = MLI->getLoopFor(BB);
if (ML && ML->getHeader()->isEHPad()) continue;
- // Conservatively treat live-in's as an external def.
- // FIXME: That means a reload that're reused in successor block(s) will not
- // be LICM'ed.
- for (const auto &LI : BB->liveins()) {
- for (MCRegUnitIterator RUI(LI.PhysReg, TRI); RUI.isValid(); ++RUI)
- RUDefs.set(*RUI);
- }
-
// Funclet entry blocks will clobber all registers
if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, Mask);
@@ -644,6 +638,16 @@ void MachineLICMImpl::HoistRegionPostRA(MachineLoop *CurLoop,
ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop);
}
+ // Mark registers as clobbered if they are livein and also defined in the loop
+ for (const auto &LoopLI : CurLoop->getHeader()->liveins()) {
+ MCPhysReg LoopLiveInReg = LoopLI.PhysReg;
+ for (MCRegUnitIterator RUI(LoopLiveInReg, TRI); RUI.isValid(); ++RUI) {
+ if (RUDefs.test(*RUI)) {
+ RUClobbers.set(*RUI);
+ }
+ }
+ }
+
// Gather the registers read / clobbered by the terminator.
BitVector TermRUs(NumRegUnits);
MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
diff --git a/llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir b/llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir
new file mode 100644
index 00000000000000..49d569f73a83ea
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir
@@ -0,0 +1,217 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass machinelicm -o - %s | FileCheck %s
+
+# A couple of "unit" tests for the post-ra MachineLICM pass.
+
+# Positive test to show MachineLICM can hoist instructions with register operands.
+---
+name: hoist_invariant_add
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: hoist_invariant_add
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1, $w2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w0 = SUBWrr killed renamable $w0, $w2
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: RET_ReallyLR
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $w0, $w1
+ CBZW renamable $w0, %bb.3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+ liveins: $w0, $w1
+ B %bb.2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ liveins: $w0, $w1
+ renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ renamable $w0 = SUBWrr killed renamable $w0, $w2
+ CBZW renamable $w0, %bb.3
+ B %bb.2
+
+ bb.3:
+ RET_ReallyLR
+...
+
+# The first ADDWri does not have loop-invariant source operands,
+# it cannot be hoisted.
+---
+name: nohoist_variable_add_operands
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: nohoist_variable_add_operands
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w2 = ADDWri renamable $w1, 1, 0
+ ; CHECK-NEXT: renamable $w0 = SUBWrr killed renamable $w0, $w2
+ ; CHECK-NEXT: renamable $w1 = ADDWri killed renamable $w1, 1, 0
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: RET_ReallyLR
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $w0, $w1
+ CBZW renamable $w0, %bb.3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+ liveins: $w0, $w1
+ B %bb.2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ liveins: $w0, $w1
+ renamable $w2 = ADDWri renamable $w1, 1, 0
+ renamable $w0 = SUBWrr killed renamable $w0, $w2
+ renamable $w1 = ADDWri killed renamable $w1, 1, 0
+ CBZW renamable $w0, %bb.3
+ B %bb.2
+
+ bb.3:
+ RET_ReallyLR
+...
+
+# w2 is a loop-livein, but it is also redefined in the loop by ADDWri.
+# The latter cannot be hoisted.
+---
+name: nohoist_redef_livein_reg
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: nohoist_redef_livein_reg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w2 = ADDWri killed renamable $w1, 0, 0
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1, $w2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w0 = SUBWrr killed renamable $w0, $w2
+ ; CHECK-NEXT: renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: RET_ReallyLR
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $w0, $w1
+ CBZW renamable $w0, %bb.3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+ liveins: $w0, $w1
+ renamable $w2 = ADDWri killed renamable $w1, 0, 0
+ B %bb.2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ liveins: $w0, $w1, $w2
+ renamable $w0 = SUBWrr killed renamable $w0, $w2
+ renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ CBZW renamable $w0, %bb.3
+ B %bb.2
+
+ bb.3:
+ RET_ReallyLR
+...
+
+# $w2 = ADDWri $w1, 1 cannot be hoisted because w2 is later clobbered
+# through an assignment to x2 (an alias of w2).
+---
+name: nohoist_clobber_through_alias_reg
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: nohoist_clobber_through_alias_reg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ ; CHECK-NEXT: renamable $w0 = SUBWrr killed renamable $w0, $w2
+ ; CHECK-NEXT: $x2 = MOVi64imm 0
+ ; CHECK-NEXT: CBZW renamable $w0, %bb.3
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: RET_ReallyLR
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $w0, $w1
+ CBZW renamable $w0, %bb.3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+ liveins: $w0, $w1
+ B %bb.2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ liveins: $w0, $w1
+ renamable $w2 = ADDWri killed renamable $w1, 1, 0
+ renamable $w0 = SUBWrr killed renamable $w0, $w2
+ $x2 = MOVi64imm 0
+ CBZW renamable $w0, %bb.3
+ B %bb.2
+
+ bb.3:
+ RET_ReallyLR
+...
diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
index 17ad2983abe905..c9a89bd7b6e4fc 100644
--- a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -9,6 +9,7 @@ define i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) {
; CHECK-LABEL: test_func_i32_two_uses:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:ptr_wrapper
+; CHECK-NEXT: and w11, w2, w0
; CHECK-NEXT: ldr x8, [x8, :got_lo12:ptr_wrapper]
; CHECK-NEXT: ldr x9, [x8]
; CHECK-NEXT: mov w8, wzr
@@ -21,7 +22,6 @@ define i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) {
; CHECK-NEXT: .LBB0_3: // %do.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ands w10, w1, w0
-; CHECK-NEXT: and w11, w2, w0
; CHECK-NEXT: cinc w8, w8, ne
; CHECK-NEXT: cmp w10, w11
; CHECK-NEXT: b.eq .LBB0_1
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index e71c6cf71c8823..1c55b94f527134 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -9307,6 +9307,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 8
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; SI-MOVREL-NEXT: s_branch .LBB26_2
; SI-MOVREL-NEXT: .LBB26_1:
; SI-MOVREL-NEXT: ; implicit-def: $vgpr0
@@ -9314,14 +9316,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; SI-MOVREL-NEXT: .LBB26_2: ; %bb2
; SI-MOVREL-NEXT: ; =>This Loop Header: Depth=1
; SI-MOVREL-NEXT: ; Child Loop BB26_4 Depth 2
-; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
; SI-MOVREL-NEXT: s_cbranch_vccnz .LBB26_1
; SI-MOVREL-NEXT: ; %bb.3: ; %bb4
; SI-MOVREL-NEXT: ; in Loop: Header=BB26_2 Depth=1
; SI-MOVREL-NEXT: buffer_load_dword v16, off, s[0:3], 0 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; SI-MOVREL-NEXT: s_mov_b64 s[4:5], exec
; SI-MOVREL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-MOVREL-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
@@ -9343,6 +9343,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; VI-MOVREL: ; %bb.0: ; %bb
; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 8
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; VI-MOVREL-NEXT: s_branch .LBB26_2
; VI-MOVREL-NEXT: .LBB26_1:
; VI-MOVREL-NEXT: ; implicit-def: $vgpr0
@@ -9350,14 +9352,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; VI-MOVREL-NEXT: .LBB26_2: ; %bb2
; VI-MOVREL-NEXT: ; =>This Loop Header: Depth=1
; VI-MOVREL-NEXT: ; Child Loop BB26_4 Depth 2
-; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
; VI-MOVREL-NEXT: s_cbranch_vccnz .LBB26_1
; VI-MOVREL-NEXT: ; %bb.3: ; %bb4
; VI-MOVREL-NEXT: ; in Loop: Header=BB26_2 Depth=1
; VI-MOVREL-NEXT: flat_load_dword v16, v[0:1] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; VI-MOVREL-NEXT: s_mov_b64 s[2:3], exec
; VI-MOVREL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-MOVREL-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
@@ -9379,6 +9379,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; VI-IDXMODE: ; %bb.0: ; %bb
; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 8
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
; VI-IDXMODE-NEXT: s_branch .LBB26_2
; VI-IDXMODE-NEXT: .LBB26_1:
; VI-IDXMODE-NEXT: ; implicit-def: $vgpr0
@@ -9386,14 +9388,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; VI-IDXMODE-NEXT: .LBB26_2: ; %bb2
; VI-IDXMODE-NEXT: ; =>This Loop Header: Depth=1
; VI-IDXMODE-NEXT: ; Child Loop BB26_4 Depth 2
-; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
; VI-IDXMODE-NEXT: s_cbranch_vccnz .LBB26_1
; VI-IDXMODE-NEXT: ; %bb.3: ; %bb4
; VI-IDXMODE-NEXT: ; in Loop: Header=BB26_2 Depth=1
; VI-IDXMODE-NEXT: flat_load_dword v16, v[0:1] glc
; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
; VI-IDXMODE-NEXT: s_mov_b64 s[2:3], exec
; VI-IDXMODE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-IDXMODE-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
@@ -9416,6 +9416,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; GFX9-IDXMODE: ; %bb.0: ; %bb
; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
; GFX9-IDXMODE-NEXT: s_branch .LBB26_2
; GFX9-IDXMODE-NEXT: .LBB26_1:
; GFX9-IDXMODE-NEXT: ; implicit-def: $vgpr0
@@ -9423,14 +9425,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; GFX9-IDXMODE-NEXT: .LBB26_2: ; %bb2
; GFX9-IDXMODE-NEXT: ; =>This Loop Header: Depth=1
; GFX9-IDXMODE-NEXT: ; Child Loop BB26_4 Depth 2
-; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
; GFX9-IDXMODE-NEXT: s_cbranch_vccnz .LBB26_1
; GFX9-IDXMODE-NEXT: ; %bb.3: ; %bb4
; GFX9-IDXMODE-NEXT: ; in Loop: Header=BB26_2 Depth=1
; GFX9-IDXMODE-NEXT: global_load_dword v16, v[0:1], off glc
; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
; GFX9-IDXMODE-NEXT: s_mov_b64 s[2:3], exec
; GFX9-IDXMODE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-IDXMODE-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 5c5a769178dd94..61fcf7be8efa4b 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -171,10 +171,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_cbranch_vccz .LBB2_12
; GFX11-NEXT: ; %bb.9:
; GFX11-NEXT: s_xor_b32 s0, s8, -1
-; GFX11-NEXT: .LBB2_10: ; %bb17
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT: .LBB2_10: ; %bb17
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_cbranch_vccz .LBB2_10
; GFX11-NEXT: ; %bb.11: ; %Flow6
; GFX11-NEXT: s_mov_b32 s18, -1
diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
index d62f045674acec..5059081422d66c 100644
--- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
@@ -10,8 +10,15 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s1, 0
+; GFX10-NEXT: s_mov_b32 s8, s4
+; GFX10-NEXT: s_mov_b32 s9, s4
+; GFX10-NEXT: s_mov_b32 s10, s4
+; GFX10-NEXT: s_mov_b32 s11, s4
+; GFX10-NEXT: s_mov_b32 s12, s4
+; GFX10-NEXT: s_mov_b32 s13, s4
+; GFX10-NEXT: s_mov_b32 s14, s4
+; GFX10-NEXT: s_mov_b32 s15, s4
; GFX10-NEXT: ; implicit-def: $sgpr2
-; GFX10-NEXT: s_inst_prefetch 0x1
; GFX10-NEXT: s_branch .LBB0_2
; GFX10-NEXT: .p2align 6
; GFX10-NEXT: .LBB0_1: ; %Flow
@@ -32,14 +39,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: s_mov_b32 s5, s4
; GFX10-NEXT: s_mov_b32 s6, s4
; GFX10-NEXT: s_mov_b32 s7, s4
-; GFX10-NEXT: s_mov_b32 s8, s4
-; GFX10-NEXT: s_mov_b32 s9, s4
-; GFX10-NEXT: s_mov_b32 s10, s4
-; GFX10-NEXT: s_mov_b32 s11, s4
-; GFX10-NEXT: s_mov_b32 s12, s4
-; GFX10-NEXT: s_mov_b32 s13, s4
-; GFX10-NEXT: s_mov_b32 s14, s4
-; GFX10-NEXT: s_mov_b32 s15, s4
; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
; GFX10-NEXT: image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -49,7 +48,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: s_or_b32 s2, s2, s0
; GFX10-NEXT: s_branch .LBB0_1
; GFX10-NEXT: .LBB0_4: ; %loop0_merge
-; GFX10-NEXT: s_inst_prefetch 0x2
; GFX10-NEXT: s_endpgm
;
; GFX12-LABEL: _amdgpu_cs_main:
@@ -59,6 +57,14 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: s_mov_b32 s8, s4
+; GFX12-NEXT: s_mov_b32 s9, s4
+; GFX12-NEXT: s_mov_b32 s10, s4
+; GFX12-NEXT: s_mov_b32 s11, s4
+; GFX12-NEXT: s_mov_b32 s12, s4
+; GFX12-NEXT: s_mov_b32 s13, s4
+; GFX12-NEXT: s_mov_b32 s14, s4
+; GFX12-NEXT: s_mov_b32 s15, s4
; GFX12-NEXT: ; implicit-def: $sgpr2
; GFX12-NEXT: s_branch .LBB0_2
; GFX12-NEXT: .LBB0_1: ; %Flow
@@ -80,14 +86,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX12-NEXT: s_mov_b32 s5, s4
; GFX12-NEXT: s_mov_b32 s6, s4
; GFX12-NEXT: s_mov_b32 s7, s4
-; GFX12-NEXT: s_mov_b32 s8, s4
-; GFX12-NEXT: s_mov_b32 s9, s4
-; GFX12-NEXT: s_mov_b32 s10, s4
-; GFX12-NEXT: s_mov_b32 s11, s4
-; GFX12-NEXT: s_mov_b32 s12, s4
-; GFX12-NEXT: s_mov_b32 s13, s4
-; GFX12-NEXT: s_mov_b32 s14, s4
-; GFX12-NEXT: s_mov_b32 s15, s4
; GFX12-NEXT: s_and_not1_b32 s2, s2, exec_lo
; GFX12-NEXT: image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
; GFX12-NEXT: s_wait_samplecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
index 0acee5bd5ac19d..a3d1214100cd5b 100644
--- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -56,9 +56,10 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 {
; CHECK-NEXT: s_waitcnt expcnt(1)
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT: .LBB1_1: ; %bb9
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_mov_b64 vcc, vcc
; CHECK-NEXT: s_cbranch_vccnz .LBB1_1
; CHECK-NEXT: ; %bb.2: ; %bb11
; CHECK-NEXT: s_mov_b32 s3, 0xf000
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
index 06a357eeaeb61e..2f...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/121769
More information about the llvm-commits mailing list