[llvm] [CodeGen] MachineLICM: Do not consider "loop liveins" as loop defs (PR #121769)

Gaëtan Bossu via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 6 07:20:41 PST 2025


https://github.com/gbossu updated https://github.com/llvm/llvm-project/pull/121769

>From 7d8ed8a9824dedbb365612ecf7caca2ee6172490 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= <gaetan.bossu at amd.com>
Date: Mon, 6 Jan 2025 10:32:55 +0000
Subject: [PATCH] [CodeGen] MachineLICM: Do not consider "loop liveins" as loop
 defs

This allows hoisting instructions that use registers which are not
re-defined in the loop. Previously, post-RA MachineLICM basically could
not hoist any instruction with register inputs.
---
 llvm/lib/CodeGen/MachineLICM.cpp              |  20 +-
 .../CodeGen/AArch64/machinelicm-post-ra.mir   | 217 ++++++++++++++++++
 llvm/test/CodeGen/AArch64/peephole-and-tst.ll |   2 +-
 .../CodeGen/AMDGPU/indirect-addressing-si.ll  |  16 +-
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |   4 +-
 .../CodeGen/AMDGPU/no-dup-inst-prefetch.ll    |  34 ++-
 .../AMDGPU/undefined-subreg-liverange.ll      |   3 +-
 llvm/test/CodeGen/RISCV/rvv/pr95865.ll        |   2 +-
 .../InferAddressSpaces/AMDGPU/flat_atomic.ll  |   3 +-
 9 files changed, 261 insertions(+), 40 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d1d5509dc482a2..0ae1f98468a3dc 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -550,6 +550,8 @@ void MachineLICMImpl::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
         for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
           // If it's using a non-loop-invariant register, then it's obviously
           // not safe to hoist.
+          // Note this isn't a final check, as we haven't gathered all the loop
+          // register definitions yet.
           if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
             HasNonInvariantUse = true;
             break;
@@ -627,14 +629,6 @@ void MachineLICMImpl::HoistRegionPostRA(MachineLoop *CurLoop,
     const MachineLoop *ML = MLI->getLoopFor(BB);
     if (ML && ML->getHeader()->isEHPad()) continue;
 
-    // Conservatively treat live-in's as an external def.
-    // FIXME: That means a reload that're reused in successor block(s) will not
-    // be LICM'ed.
-    for (const auto &LI : BB->liveins()) {
-      for (MCRegUnitIterator RUI(LI.PhysReg, TRI); RUI.isValid(); ++RUI)
-        RUDefs.set(*RUI);
-    }
-
     // Funclet entry blocks will clobber all registers
     if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
       applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, Mask);
@@ -644,6 +638,16 @@ void MachineLICMImpl::HoistRegionPostRA(MachineLoop *CurLoop,
       ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop);
   }
 
+  // Mark registers as clobbered if they are live-in and also defined in the
+  // loop.
+  for (const auto &LoopLI : CurLoop->getHeader()->liveins()) {
+    MCPhysReg LoopLiveInReg = LoopLI.PhysReg;
+    for (MCRegUnitIterator RUI(LoopLiveInReg, TRI); RUI.isValid(); ++RUI) {
+      if (RUDefs.test(*RUI)) {
+        RUClobbers.set(*RUI);
+      }
+    }
+  }
+
   // Gather the registers read / clobbered by the terminator.
   BitVector TermRUs(NumRegUnits);
   MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
diff --git a/llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir b/llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir
new file mode 100644
index 00000000000000..49d569f73a83ea
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machinelicm-post-ra.mir
@@ -0,0 +1,217 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass machinelicm -o - %s | FileCheck %s
+
+# A couple of "unit" tests for the post-ra MachineLICM pass.
+
+# Positive test to show MachineLICM can hoist instructions with register operands.
+---
+name:            hoist_invariant_add
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: hoist_invariant_add
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CBZW renamable $w0, %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $w2 = ADDWri killed renamable $w1, 1, 0
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1, $w2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $w0 = SUBWrr killed renamable $w0, $w2
+  ; CHECK-NEXT:   CBZW renamable $w0, %bb.3
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0:
+    successors: %bb.3(0x40000000), %bb.1(0x40000000)
+    liveins: $w0, $w1
+    CBZW renamable $w0, %bb.3
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+    liveins: $w0, $w1
+    B %bb.2
+
+  bb.2:
+    successors: %bb.3(0x40000000), %bb.2(0x40000000)
+    liveins: $w0, $w1
+    renamable $w2 = ADDWri killed renamable $w1, 1, 0
+    renamable $w0 = SUBWrr killed renamable $w0, $w2
+    CBZW renamable $w0, %bb.3
+    B %bb.2
+
+  bb.3:
+    RET_ReallyLR
+...
+
+# The first ADDWri does not have loop-invariant source operands,
+# it cannot be hoisted.
+---
+name:            nohoist_variable_add_operands
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: nohoist_variable_add_operands
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CBZW renamable $w0, %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $w2 = ADDWri renamable $w1, 1, 0
+  ; CHECK-NEXT:   renamable $w0 = SUBWrr killed renamable $w0, $w2
+  ; CHECK-NEXT:   renamable $w1 = ADDWri killed renamable $w1, 1, 0
+  ; CHECK-NEXT:   CBZW renamable $w0, %bb.3
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0:
+    successors: %bb.3(0x40000000), %bb.1(0x40000000)
+    liveins: $w0, $w1
+    CBZW renamable $w0, %bb.3
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+    liveins: $w0, $w1
+    B %bb.2
+
+  bb.2:
+    successors: %bb.3(0x40000000), %bb.2(0x40000000)
+    liveins: $w0, $w1
+    renamable $w2 = ADDWri renamable $w1, 1, 0
+    renamable $w0 = SUBWrr killed renamable $w0, $w2
+    renamable $w1 = ADDWri killed renamable $w1, 1, 0
+    CBZW renamable $w0, %bb.3
+    B %bb.2
+
+  bb.3:
+    RET_ReallyLR
+...
+
+# w2 is a loop-livein, but it is also redefined in the loop by ADDWri.
+# The latter cannot be hoisted.
+---
+name:            nohoist_redef_livein_reg
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: nohoist_redef_livein_reg
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CBZW renamable $w0, %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $w2 = ADDWri killed renamable $w1, 0, 0
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1, $w2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $w0 = SUBWrr killed renamable $w0, $w2
+  ; CHECK-NEXT:   renamable $w2 = ADDWri killed renamable $w1, 1, 0
+  ; CHECK-NEXT:   CBZW renamable $w0, %bb.3
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0:
+    successors: %bb.3(0x40000000), %bb.1(0x40000000)
+    liveins: $w0, $w1
+    CBZW renamable $w0, %bb.3
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+    liveins: $w0, $w1
+    renamable $w2 = ADDWri killed renamable $w1, 0, 0
+    B %bb.2
+
+  bb.2:
+    successors: %bb.3(0x40000000), %bb.2(0x40000000)
+    liveins: $w0, $w1, $w2
+    renamable $w0 = SUBWrr killed renamable $w0, $w2
+    renamable $w2 = ADDWri killed renamable $w1, 1, 0
+    CBZW renamable $w0, %bb.3
+    B %bb.2
+
+  bb.3:
+    RET_ReallyLR
+...
+
+# $w2 = ADDWri $w1, 1 cannot be hoisted because w2 is later clobbered
+# through an assignment to x2 (an alias of w2).
+---
+name:            nohoist_clobber_through_alias_reg
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: nohoist_clobber_through_alias_reg
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CBZW renamable $w0, %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $w2 = ADDWri killed renamable $w1, 1, 0
+  ; CHECK-NEXT:   renamable $w0 = SUBWrr killed renamable $w0, $w2
+  ; CHECK-NEXT:   $x2 = MOVi64imm 0
+  ; CHECK-NEXT:   CBZW renamable $w0, %bb.3
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0:
+    successors: %bb.3(0x40000000), %bb.1(0x40000000)
+    liveins: $w0, $w1
+    CBZW renamable $w0, %bb.3
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+    liveins: $w0, $w1
+    B %bb.2
+
+  bb.2:
+    successors: %bb.3(0x40000000), %bb.2(0x40000000)
+    liveins: $w0, $w1
+    renamable $w2 = ADDWri killed renamable $w1, 1, 0
+    renamable $w0 = SUBWrr killed renamable $w0, $w2
+    $x2 = MOVi64imm 0
+    CBZW renamable $w0, %bb.3
+    B %bb.2
+
+  bb.3:
+    RET_ReallyLR
+...
diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
index 17ad2983abe905..c9a89bd7b6e4fc 100644
--- a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -9,6 +9,7 @@ define i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) {
 ; CHECK-LABEL: test_func_i32_two_uses:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    adrp x8, :got:ptr_wrapper
+; CHECK-NEXT:    and w11, w2, w0
 ; CHECK-NEXT:    ldr x8, [x8, :got_lo12:ptr_wrapper]
 ; CHECK-NEXT:    ldr x9, [x8]
 ; CHECK-NEXT:    mov w8, wzr
@@ -21,7 +22,6 @@ define i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) {
 ; CHECK-NEXT:  .LBB0_3: // %do.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ands w10, w1, w0
-; CHECK-NEXT:    and w11, w2, w0
 ; CHECK-NEXT:    cinc w8, w8, ne
 ; CHECK-NEXT:    cmp w10, w11
 ; CHECK-NEXT:    b.eq .LBB0_1
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index e71c6cf71c8823..1c55b94f527134 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -9307,6 +9307,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, 8
 ; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
+; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT:    v_mov_b32_e32 v17, s1
 ; SI-MOVREL-NEXT:    s_branch .LBB26_2
 ; SI-MOVREL-NEXT:  .LBB26_1:
 ; SI-MOVREL-NEXT:    ; implicit-def: $vgpr0
@@ -9314,14 +9316,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; SI-MOVREL-NEXT:  .LBB26_2: ; %bb2
 ; SI-MOVREL-NEXT:    ; =>This Loop Header: Depth=1
 ; SI-MOVREL-NEXT:    ; Child Loop BB26_4 Depth 2
-; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-MOVREL-NEXT:    v_cmp_le_i32_e32 vcc, s0, v0
 ; SI-MOVREL-NEXT:    s_cbranch_vccnz .LBB26_1
 ; SI-MOVREL-NEXT:  ; %bb.3: ; %bb4
 ; SI-MOVREL-NEXT:    ; in Loop: Header=BB26_2 Depth=1
 ; SI-MOVREL-NEXT:    buffer_load_dword v16, off, s[0:3], 0 glc
 ; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT:    v_mov_b32_e32 v17, s1
 ; SI-MOVREL-NEXT:    s_mov_b64 s[4:5], exec
 ; SI-MOVREL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; SI-MOVREL-NEXT:  .LBB26_4: ; Parent Loop BB26_2 Depth=1
@@ -9343,6 +9343,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; VI-MOVREL:       ; %bb.0: ; %bb
 ; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, 8
+; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s1
 ; VI-MOVREL-NEXT:    s_branch .LBB26_2
 ; VI-MOVREL-NEXT:  .LBB26_1:
 ; VI-MOVREL-NEXT:    ; implicit-def: $vgpr0
@@ -9350,14 +9352,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; VI-MOVREL-NEXT:  .LBB26_2: ; %bb2
 ; VI-MOVREL-NEXT:    ; =>This Loop Header: Depth=1
 ; VI-MOVREL-NEXT:    ; Child Loop BB26_4 Depth 2
-; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-MOVREL-NEXT:    v_cmp_le_i32_e32 vcc, s0, v0
 ; VI-MOVREL-NEXT:    s_cbranch_vccnz .LBB26_1
 ; VI-MOVREL-NEXT:  ; %bb.3: ; %bb4
 ; VI-MOVREL-NEXT:    ; in Loop: Header=BB26_2 Depth=1
 ; VI-MOVREL-NEXT:    flat_load_dword v16, v[0:1] glc
 ; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
-; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s1
 ; VI-MOVREL-NEXT:    s_mov_b64 s[2:3], exec
 ; VI-MOVREL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-MOVREL-NEXT:  .LBB26_4: ; Parent Loop BB26_2 Depth=1
@@ -9379,6 +9379,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; VI-IDXMODE:       ; %bb.0: ; %bb
 ; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 8
+; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s1
 ; VI-IDXMODE-NEXT:    s_branch .LBB26_2
 ; VI-IDXMODE-NEXT:  .LBB26_1:
 ; VI-IDXMODE-NEXT:    ; implicit-def: $vgpr0
@@ -9386,14 +9388,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; VI-IDXMODE-NEXT:  .LBB26_2: ; %bb2
 ; VI-IDXMODE-NEXT:    ; =>This Loop Header: Depth=1
 ; VI-IDXMODE-NEXT:    ; Child Loop BB26_4 Depth 2
-; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-IDXMODE-NEXT:    v_cmp_le_i32_e32 vcc, s0, v0
 ; VI-IDXMODE-NEXT:    s_cbranch_vccnz .LBB26_1
 ; VI-IDXMODE-NEXT:  ; %bb.3: ; %bb4
 ; VI-IDXMODE-NEXT:    ; in Loop: Header=BB26_2 Depth=1
 ; VI-IDXMODE-NEXT:    flat_load_dword v16, v[0:1] glc
 ; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
-; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s1
 ; VI-IDXMODE-NEXT:    s_mov_b64 s[2:3], exec
 ; VI-IDXMODE-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-IDXMODE-NEXT:  .LBB26_4: ; Parent Loop BB26_2 Depth=1
@@ -9416,6 +9416,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; GFX9-IDXMODE:       ; %bb.0: ; %bb
 ; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 8
+; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, s1
 ; GFX9-IDXMODE-NEXT:    s_branch .LBB26_2
 ; GFX9-IDXMODE-NEXT:  .LBB26_1:
 ; GFX9-IDXMODE-NEXT:    ; implicit-def: $vgpr0
@@ -9423,14 +9425,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; GFX9-IDXMODE-NEXT:  .LBB26_2: ; %bb2
 ; GFX9-IDXMODE-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX9-IDXMODE-NEXT:    ; Child Loop BB26_4 Depth 2
-; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-IDXMODE-NEXT:    v_cmp_le_i32_e32 vcc, s0, v0
 ; GFX9-IDXMODE-NEXT:    s_cbranch_vccnz .LBB26_1
 ; GFX9-IDXMODE-NEXT:  ; %bb.3: ; %bb4
 ; GFX9-IDXMODE-NEXT:    ; in Loop: Header=BB26_2 Depth=1
 ; GFX9-IDXMODE-NEXT:    global_load_dword v16, v[0:1], off glc
 ; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, s1
 ; GFX9-IDXMODE-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-IDXMODE-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-IDXMODE-NEXT:  .LBB26_4: ; Parent Loop BB26_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 5c5a769178dd94..61fcf7be8efa4b 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -171,10 +171,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_cbranch_vccz .LBB2_12
 ; GFX11-NEXT:  ; %bb.9:
 ; GFX11-NEXT:    s_xor_b32 s0, s8, -1
-; GFX11-NEXT:  .LBB2_10: ; %bb17
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT:  .LBB2_10: ; %bb17
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_cbranch_vccz .LBB2_10
 ; GFX11-NEXT:  ; %bb.11: ; %Flow6
 ; GFX11-NEXT:    s_mov_b32 s18, -1
diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
index d62f045674acec..5059081422d66c 100644
--- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
@@ -10,8 +10,15 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    s_mov_b32 s1, 0
+; GFX10-NEXT:    s_mov_b32 s8, s4
+; GFX10-NEXT:    s_mov_b32 s9, s4
+; GFX10-NEXT:    s_mov_b32 s10, s4
+; GFX10-NEXT:    s_mov_b32 s11, s4
+; GFX10-NEXT:    s_mov_b32 s12, s4
+; GFX10-NEXT:    s_mov_b32 s13, s4
+; GFX10-NEXT:    s_mov_b32 s14, s4
+; GFX10-NEXT:    s_mov_b32 s15, s4
 ; GFX10-NEXT:    ; implicit-def: $sgpr2
-; GFX10-NEXT:    s_inst_prefetch 0x1
 ; GFX10-NEXT:    s_branch .LBB0_2
 ; GFX10-NEXT:    .p2align 6
 ; GFX10-NEXT:  .LBB0_1: ; %Flow
@@ -32,14 +39,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ; GFX10-NEXT:    s_mov_b32 s5, s4
 ; GFX10-NEXT:    s_mov_b32 s6, s4
 ; GFX10-NEXT:    s_mov_b32 s7, s4
-; GFX10-NEXT:    s_mov_b32 s8, s4
-; GFX10-NEXT:    s_mov_b32 s9, s4
-; GFX10-NEXT:    s_mov_b32 s10, s4
-; GFX10-NEXT:    s_mov_b32 s11, s4
-; GFX10-NEXT:    s_mov_b32 s12, s4
-; GFX10-NEXT:    s_mov_b32 s13, s4
-; GFX10-NEXT:    s_mov_b32 s14, s4
-; GFX10-NEXT:    s_mov_b32 s15, s4
 ; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
 ; GFX10-NEXT:    image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -49,7 +48,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ; GFX10-NEXT:    s_or_b32 s2, s2, s0
 ; GFX10-NEXT:    s_branch .LBB0_1
 ; GFX10-NEXT:  .LBB0_4: ; %loop0_merge
-; GFX10-NEXT:    s_inst_prefetch 0x2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: _amdgpu_cs_main:
@@ -59,6 +57,14 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:    s_mov_b32 s8, s4
+; GFX12-NEXT:    s_mov_b32 s9, s4
+; GFX12-NEXT:    s_mov_b32 s10, s4
+; GFX12-NEXT:    s_mov_b32 s11, s4
+; GFX12-NEXT:    s_mov_b32 s12, s4
+; GFX12-NEXT:    s_mov_b32 s13, s4
+; GFX12-NEXT:    s_mov_b32 s14, s4
+; GFX12-NEXT:    s_mov_b32 s15, s4
 ; GFX12-NEXT:    ; implicit-def: $sgpr2
 ; GFX12-NEXT:    s_branch .LBB0_2
 ; GFX12-NEXT:  .LBB0_1: ; %Flow
@@ -80,14 +86,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ; GFX12-NEXT:    s_mov_b32 s5, s4
 ; GFX12-NEXT:    s_mov_b32 s6, s4
 ; GFX12-NEXT:    s_mov_b32 s7, s4
-; GFX12-NEXT:    s_mov_b32 s8, s4
-; GFX12-NEXT:    s_mov_b32 s9, s4
-; GFX12-NEXT:    s_mov_b32 s10, s4
-; GFX12-NEXT:    s_mov_b32 s11, s4
-; GFX12-NEXT:    s_mov_b32 s12, s4
-; GFX12-NEXT:    s_mov_b32 s13, s4
-; GFX12-NEXT:    s_mov_b32 s14, s4
-; GFX12-NEXT:    s_mov_b32 s15, s4
 ; GFX12-NEXT:    s_and_not1_b32 s2, s2, exec_lo
 ; GFX12-NEXT:    image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
index 0acee5bd5ac19d..a3d1214100cd5b 100644
--- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -56,9 +56,10 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 {
 ; CHECK-NEXT:    s_waitcnt expcnt(1)
 ; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v1
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; CHECK-NEXT:  .LBB1_1: ; %bb9
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB1_1
 ; CHECK-NEXT:  ; %bb.2: ; %bb11
 ; CHECK-NEXT:    s_mov_b32 s3, 0xf000
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
index 06a357eeaeb61e..2f8efc511632b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
@@ -46,6 +46,7 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    andi t3, a4, 1
 ; CHECK-NEXT:    li t2, 4
+; CHECK-NEXT:    andi s10, a0, 1
 ; CHECK-NEXT:  .LBB0_1: # %for.cond1.preheader.i
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
 ; CHECK-NEXT:    # Child Loop BB0_2 Depth 2
@@ -105,7 +106,6 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    addi s8, s8, 4
 ; CHECK-NEXT:    addi ra, ra, 4
 ; CHECK-NEXT:    addi a3, a3, 4
-; CHECK-NEXT:    andi s10, a0, 1
 ; CHECK-NEXT:    addi s11, s11, 4
 ; CHECK-NEXT:    beqz s10, .LBB0_4
 ; CHECK-NEXT:  # %bb.7: # %for.cond.cleanup11.i
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
index 6633cec659d8e5..90778606a8e5b3 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
@@ -129,9 +129,10 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; CHECK-NEXT:  .LBB3_1: ; %bb0
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB3_1
 ; CHECK-NEXT:  ; %bb.2: ; %bb1
 ; CHECK-NEXT:    s_mov_b64 s[0:1], exec



More information about the llvm-commits mailing list