[llvm] [AMDGPU] Hoist WMMA coexecution hazard V_NOPs from loops to preheaders (PR #176895)

Dark Steve via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 24 20:10:42 PST 2026


https://github.com/PrasoonMishra updated https://github.com/llvm/llvm-project/pull/176895

>From 35f1282aeb9f337aa742106e0e1d041d8fd789f9 Mon Sep 17 00:00:00 2001
From: Prasoon Mishra <Prasoon.Mishra at amd.com>
Date: Tue, 20 Jan 2026 08:23:18 +0000
Subject: [PATCH 1/7] [AMDGPU] Hoist WMMA coexecution hazard V_NOPs from loops
 to preheaders

On GFX1250, V_NOPs inserted for WMMA coexecution hazards are placed at
the use-site. When the hazard-consuming instruction is inside a loop and
the WMMA is outside, these NOPs execute every iteration even though the
hazard only needs to be covered once.

This patch hoists the V_NOPs to the loop preheader, so they execute once
per loop entry instead of once per iteration.

Example (assuming a hazard requiring K V_NOPs):
  Before:
    bb.0 (preheader): WMMA writes vgpr0
    bb.1 (loop):      V_NOP xK, VALU reads vgpr0, branch bb.1
                      -> K NOPs executed per iteration

  After:
    bb.0 (preheader): WMMA writes vgpr0, V_NOP xK
    bb.1 (loop):      VALU reads vgpr0, branch bb.1
                      -> K NOPs executed once

For nested loops, V_NOPs are hoisted to the preheader of the outermost
enclosing loop that contains no conflicting WMMA.

Hoisting is restricted to strict preheaders (not any single predecessor)
to avoid introducing V_NOPs on unrelated control flow paths.

MachineLoopInfo is computed lazily within GCNHazardRecognizer.

The optimization is controlled by -amdgpu-wmma-vnop-hoisting (default: on).
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 222 +++++++--
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h  |  21 +
 .../test/CodeGen/AMDGPU/wmma-nop-hoisting.mir | 456 ++++++++++++++++--
 3 files changed, 620 insertions(+), 79 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index d504d8618b90d..5ba511d3d5e0f 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -14,14 +14,23 @@
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/TargetParser/TargetParser.h"
 
 using namespace llvm;
 
+#define DEBUG_TYPE "gcn-hazard-recognizer"
+
+STATISTIC(NumWMMANopsHoisted,
+          "Number of WMMA hazard V_NOPs hoisted from loops");
+STATISTIC(NumWMMAHoistingBailed,
+          "Number of WMMA hazards where V_NOP hoisting was not possible");
+
 namespace {
 
 struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
@@ -50,6 +59,10 @@ static cl::opt<unsigned>
     NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
                cl::desc("Insert a s_nop x before every instruction"));
 
+static cl::opt<bool> EnableWMMAVnopHoisting(
+    "amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden,
+    cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
+
 //===----------------------------------------------------------------------===//
 // Hazard Recognizer Implementation
 //===----------------------------------------------------------------------===//
@@ -1288,7 +1301,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
   fixVALUTransUseHazard(MI);
   fixVALUTransCoexecutionHazards(MI);
   fixWMMAHazards(MI); // fall-through if co-execution is enabled.
-  emitVNops(MI, checkWMMACoexecutionHazards(MI));
+  fixWMMACoexecutionHazards(MI);
   fixShift64HighRegBug(MI);
   fixVALUMaskWriteHazard(MI);
   fixRequiredExportPriority(MI);
@@ -2084,8 +2097,6 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
   if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
     return 0;
 
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
-
   // WaitStates here is the number of V_NOPs or unrelated VALU instructions must
   // be in between the first WMMA and the second instruction to cover the hazard
   // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
@@ -2095,7 +2106,7 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
   const int VALUWaitStates[] = {4, 8, 2, 4};
   unsigned Category = 0;
 
-  auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+  auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
     if (!TII->isXDLWMMA(I))
       return false;
 
@@ -2103,24 +2114,10 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
     if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
       return false;
 
-    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
-    Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
-    Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
-
-    // WMMA0 wrires (D0), WMMA1 reads (A1/B1/Idx1).
-    if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
-      return true;
-
-    if (SIInstrInfo::isSWMMAC(*MI)) {
-      Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
-      if (TRI->regsOverlap(D0, Idx1))
-        return true;
-    }
-
-    return false;
+    return hasWMMAToWMMARegOverlap(I, *MI);
   };
 
-  auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+  auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
     if (!TII->isXDLWMMA(I))
       return false;
 
@@ -2128,35 +2125,7 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
     if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
       return false;
 
-    // WMMA writes, VALU reads.
-    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
-    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
-      if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
-        return true;
-    }
-
-    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
-    if (!ValuDst || !ValuDst->isReg())
-      return false;
-    Register D1 = ValuDst->getReg();
-
-    // WMMA writes, VALU writes.
-    if (TRI->regsOverlap(D0, D1))
-      return true;
-
-    // WMMA reads, VALU writes.
-    Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
-    Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
-    if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
-      return true;
-
-    if (SIInstrInfo::isSWMMAC(I)) {
-      Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
-      if (TRI->regsOverlap(D1, Idx0))
-        return true;
-    }
-
-    return false;
+    return hasWMMAToVALURegOverlap(I, *MI);
   };
 
   int Limit = 0;
@@ -2191,6 +2160,161 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
   return WaitStatesNeeded;
 }
 
+void GCNHazardRecognizer::insertVnopsBeforeTerminator(MachineBasicBlock *MBB,
+                                                      int Count) {
+  MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
+  const DebugLoc &DL =
+      InsertPt != MBB->end() ? InsertPt->getDebugLoc() : DebugLoc();
+
+  for (int i = 0; i < Count; ++i) {
+    BuildMI(*MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
+  }
+}
+
+bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
+    const MachineInstr &WMMA, const MachineInstr &MI) const {
+  Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
+  Register A1 = TII.getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
+  Register B1 = TII.getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
+
+  // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
+  if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
+    return true;
+
+  if (SIInstrInfo::isSWMMAC(MI)) {
+    Register Idx1 = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
+    if (TRI.regsOverlap(D0, Idx1))
+      return true;
+  }
+  return false;
+}
+
+bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
+    const MachineInstr &WMMA, const MachineInstr &MI) const {
+  // WMMA writes, VALU reads.
+  Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
+  for (const MachineOperand &ValuUse : MI.explicit_uses()) {
+    if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
+      return true;
+  }
+
+  auto *ValuDst = TII.getNamedOperand(MI, AMDGPU::OpName::vdst);
+  if (!ValuDst || !ValuDst->isReg())
+    return false;
+  Register D1 = ValuDst->getReg();
+
+  // WMMA writes, VALU writes.
+  if (TRI.regsOverlap(D0, D1))
+    return true;
+
+  // WMMA reads, VALU writes.
+  Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
+  Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
+  if (TRI.regsOverlap(A0, D1) || TRI.regsOverlap(B0, D1))
+    return true;
+
+  if (SIInstrInfo::isSWMMAC(WMMA)) {
+    Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
+    if (TRI.regsOverlap(D1, Idx0))
+      return true;
+  }
+  return false;
+}
+
+bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
+                                                 const MachineInstr &MI) const {
+  if (!TII.isXDLWMMA(I))
+    return false;
+
+  // Dispatch based on MI type
+  if (TII.isXDLWMMA(MI))
+    return hasWMMAToWMMARegOverlap(I, MI);
+  else if (isCoexecutableVALUInst(MI))
+    return hasWMMAToVALURegOverlap(I, MI);
+
+  return false;
+}
+
+bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
+                                              bool IncludeSubloops) {
+  // Scan loop for any WMMA that hazards MI.
+  // TODO: Avoid full loop scan when WMMA is beyond VALU distance.
+  for (MachineBasicBlock *MBB : L->getBlocks()) {
+    if (!IncludeSubloops && MLI->getLoopFor(MBB) != L)
+      continue;
+    for (MachineInstr &I : *MBB) {
+      if (&I == MI)
+        continue;
+      if (isCoexecutionHazardFor(I, *MI))
+        return true;
+    }
+  }
+  return false;
+}
+
+void GCNHazardRecognizer::ensureLoopInfoAvailable() {
+  // Lazily compute MDT and MLI only when needed
+  if (MLI)
+    return;
+
+  OwnedMDT =
+      std::make_unique<MachineDominatorTree>(const_cast<MachineFunction &>(MF));
+  OwnedMLI = std::make_unique<MachineLoopInfo>();
+  OwnedMLI->analyze(*OwnedMDT);
+
+  MLI = OwnedMLI.get();
+}
+
+bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
+                                                    int WaitStatesNeeded) {
+  ensureLoopInfoAvailable();
+
+  MachineLoop *L = MLI->getLoopFor(MI->getParent());
+  if (!L) {
+    ++NumWMMAHoistingBailed;
+    return false;
+  }
+
+  // If innermost loop has WMMA hazard, we can't hoist at all
+  if (hasWMMAHazardInLoop(L, MI)) {
+    ++NumWMMAHoistingBailed;
+    return false;
+  }
+
+  // Find outermost loop with no internal hazard
+  MachineLoop *TargetLoop = L;
+  while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
+    if (hasWMMAHazardInLoop(Parent, MI, false))
+      break;             // Parent has hazard in its own blocks, stop here
+    TargetLoop = Parent; // Safe to hoist further out
+  }
+
+  // Need valid preheader to insert V_NOPs
+  MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
+  if (!Preheader) {
+    ++NumWMMAHoistingBailed;
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
+                    << " V_NOPs from loop to " << Preheader->getName() << "\n");
+
+  insertVnopsBeforeTerminator(Preheader, WaitStatesNeeded);
+  NumWMMANopsHoisted += WaitStatesNeeded;
+  return true;
+}
+
+bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
+  int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
+  if (WaitStatesNeeded <= 0)
+    return false;
+
+  if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
+    return true;
+
+  return emitVNops(MI, WaitStatesNeeded);
+}
+
 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
   if (!ST.hasShift64HighRegBug())
     return false;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index d725134639cfe..0a66b4206ce4a 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -15,9 +15,12 @@
 
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include <list>
+#include <memory>
 
 namespace llvm {
 
@@ -49,6 +52,12 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   const SIInstrInfo &TII;
   const SIRegisterInfo &TRI;
   const TargetSchedModel &TSchedModel;
+
+  // Loop info for V_NOP hoisting, computed on demand only when needed.
+  std::unique_ptr<MachineDominatorTree> OwnedMDT;
+  std::unique_ptr<MachineLoopInfo> OwnedMLI;
+  MachineLoopInfo *MLI = nullptr;
+
   bool RunLdsBranchVmemWARHazardFixup;
 
   /// RegUnits of uses in the current soft memory clause.
@@ -114,6 +123,18 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   bool fixVALUTransCoexecutionHazards(MachineInstr *MI);
   bool fixWMMAHazards(MachineInstr *MI);
   int checkWMMACoexecutionHazards(MachineInstr *MI);
+  bool fixWMMACoexecutionHazards(MachineInstr *MI);
+  void ensureLoopInfoAvailable();
+  bool tryHoistWMMAVnopsFromLoop(MachineInstr *MI, int WaitStatesNeeded);
+  bool hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
+                           bool IncludeSubloops = true);
+  bool hasWMMAToWMMARegOverlap(const MachineInstr &WMMA,
+                               const MachineInstr &MI) const;
+  bool hasWMMAToVALURegOverlap(const MachineInstr &WMMA,
+                               const MachineInstr &MI) const;
+  bool isCoexecutionHazardFor(const MachineInstr &I,
+                              const MachineInstr &MI) const;
+  void insertVnopsBeforeTerminator(MachineBasicBlock *MBB, int Count);
   bool fixShift64HighRegBug(MachineInstr *MI);
   bool fixVALUMaskWriteHazard(MachineInstr *MI);
   bool fixRequiredExportPriority(MachineInstr *MI);
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
index 283b1a352d1d6..3feabeb8986ce 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
@@ -1,38 +1,212 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass post-RA-hazard-rec -amdgpu-wmma-vnop-hoisting=false %s -o - | FileCheck -check-prefix=NOHOIST %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=HOIST %s
 
-# Test: WMMA outside both loops, VALU in inner loop
-# Currently NOPs are inserted inside the loop body (bb.2).
-# A future optimization could hoist these NOPs to the preheader (bb.0).
+# Test 1: WMMA outside loop, VALU inside loop
+# The NOPs should be hoisted from the loop body to the preheader
+---
+name: test_simple_loop_hoist
+body: |
+  ; NOHOIST-LABEL: name: test_simple_loop_hoist
+  ; NOHOIST: bb.0:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; NOHOIST-NEXT:   S_BRANCH %bb.1
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.1:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; NOHOIST-NEXT:   S_BRANCH %bb.1
+  ;
+  ; HOIST-LABEL: name: test_simple_loop_hoist
+  ; HOIST: bb.0:
+  ; HOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   S_BRANCH %bb.1
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.1:
+  ; HOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; HOIST-NEXT:   S_BRANCH %bb.1
+  bb.0:
+    successors: %bb.1
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+  bb.1:
+    successors: %bb.1
+    $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+    S_BRANCH %bb.1
+...
+
+# Test 2: WMMA hazard INSIDE the loop; should NOT hoist
+---
+name: test_internal_hazard_no_hoist
+body: |
+  ; NOHOIST-LABEL: name: test_internal_hazard_no_hoist
+  ; NOHOIST: bb.0:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   S_BRANCH %bb.1
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.1:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; NOHOIST-NEXT:   S_BRANCH %bb.1
+  ;
+  ; HOIST-LABEL: name: test_internal_hazard_no_hoist
+  ; HOIST: bb.0:
+  ; HOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   S_BRANCH %bb.1
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.1:
+  ; HOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; HOIST-NEXT:   S_BRANCH %bb.1
+  bb.0:
+    successors: %bb.1
+    S_BRANCH %bb.1
+  bb.1:
+    successors: %bb.1
+    ; WMMA inside the loop writes to vgpr16-23, VALU reads vgpr16
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+    S_BRANCH %bb.1
+...
+
+# Test 3: WMMA in loop but no hazard
+---
+name: test_wmma_in_loop_no_conflict_hoist
+body: |
+  ; NOHOIST-LABEL: name: test_wmma_in_loop_no_conflict_hoist
+  ; NOHOIST: bb.0:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; NOHOIST-NEXT:   S_BRANCH %bb.1
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.1:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, 8, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, 0, implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; NOHOIST-NEXT:   S_BRANCH %bb.1
+  ;
+  ; HOIST-LABEL: name: test_wmma_in_loop_no_conflict_hoist
+  ; HOIST: bb.0:
+  ; HOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   S_BRANCH %bb.1
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.1:
+  ; HOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, 8, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, 0, implicit $exec
+  ; HOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; HOIST-NEXT:   S_BRANCH %bb.1
+  bb.0:
+    successors: %bb.1
+    ; External WMMA writes to vgpr16-23
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+  bb.1:
+    successors: %bb.1
+    ; Loop WMMA writes to vgpr56-63 (different registers)
+    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, 8, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, 0, implicit $exec
+    ; This reads vgpr16 from the external WMMA
+    $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+    S_BRANCH %bb.1
+...
+
+# Test 4: WMMA outside both loops, VALU in inner loop
+# NOPs should be hoisted to the outermost preheader (bb.0)
 ---
 name: test_nested_loop_hoist_to_outermost
 body: |
-  ; CHECK-LABEL: name: test_nested_loop_hoist_to_outermost
-  ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   S_BRANCH %bb.1
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit undef $scc
-  ; CHECK-NEXT:   S_BRANCH %bb.2
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
-  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
-  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
-  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
-  ; CHECK-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.1, implicit $exec
-  ; CHECK-NEXT:   S_BRANCH %bb.2
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   S_ENDPGM 0
+  ; NOHOIST-LABEL: name: test_nested_loop_hoist_to_outermost
+  ; NOHOIST: bb.0:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; NOHOIST-NEXT:   S_BRANCH %bb.1
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.1:
+  ; NOHOIST-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+  ; NOHOIST-NEXT:   S_BRANCH %bb.2
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.2:
+  ; NOHOIST-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; NOHOIST-NEXT:   S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; NOHOIST-NEXT:   S_BRANCH %bb.2
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.3:
+  ; NOHOIST-NEXT:   S_ENDPGM 0
+  ;
+  ; HOIST-LABEL: name: test_nested_loop_hoist_to_outermost
+  ; HOIST: bb.0:
+  ; HOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   S_BRANCH %bb.1
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.1:
+  ; HOIST-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+  ; HOIST-NEXT:   S_BRANCH %bb.2
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.2:
+  ; HOIST-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; HOIST-NEXT:   S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; HOIST-NEXT:   S_BRANCH %bb.2
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.3:
+  ; HOIST-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1
     ; WMMA outside all loops - writes to vgpr16-23
@@ -54,3 +228,225 @@ body: |
     ; Exit block
     S_ENDPGM 0
 ...
+
+# Test 5: Triple nested loop - WMMA hazard in outer loop (L1)
+# VALU in innermost loop (L3) reads from WMMA in L1's body
+# NOPs should be hoisted to L2's preheader (bb.1)
+---
+name: test_triple_nested_hoist_to_intermediate
+body: |
+  ; NOHOIST-LABEL: name: test_triple_nested_hoist_to_intermediate
+  ; NOHOIST: bb.0:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   S_BRANCH %bb.1
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.1:
+  ; NOHOIST-NEXT:   successors: %bb.2(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; NOHOIST-NEXT:   S_BRANCH %bb.2
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.2:
+  ; NOHOIST-NEXT:   successors: %bb.3(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   S_BRANCH %bb.3
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.3:
+  ; NOHOIST-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; NOHOIST-NEXT:   S_CBRANCH_EXECZ %bb.4, implicit $exec
+  ; NOHOIST-NEXT:   S_BRANCH %bb.3
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.4:
+  ; NOHOIST-NEXT:   successors: %bb.2(0x40000000), %bb.5(0x40000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit undef $scc
+  ; NOHOIST-NEXT:   S_BRANCH %bb.2
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.5:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x40000000), %bb.6(0x40000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit undef $scc
+  ; NOHOIST-NEXT:   S_BRANCH %bb.1
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.6:
+  ; NOHOIST-NEXT:   S_ENDPGM 0
+  ;
+  ; HOIST-LABEL: name: test_triple_nested_hoist_to_intermediate
+  ; HOIST: bb.0:
+  ; HOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   S_BRANCH %bb.1
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.1:
+  ; HOIST-NEXT:   successors: %bb.2(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   S_BRANCH %bb.2
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.2:
+  ; HOIST-NEXT:   successors: %bb.3(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   S_BRANCH %bb.3
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.3:
+  ; HOIST-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; HOIST-NEXT:   S_CBRANCH_EXECZ %bb.4, implicit $exec
+  ; HOIST-NEXT:   S_BRANCH %bb.3
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.4:
+  ; HOIST-NEXT:   successors: %bb.2(0x40000000), %bb.5(0x40000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit undef $scc
+  ; HOIST-NEXT:   S_BRANCH %bb.2
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.5:
+  ; HOIST-NEXT:   successors: %bb.1(0x40000000), %bb.6(0x40000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit undef $scc
+  ; HOIST-NEXT:   S_BRANCH %bb.1
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.6:
+  ; HOIST-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+    S_BRANCH %bb.1
+  bb.1:
+    successors: %bb.2
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.2
+  bb.2:
+    successors: %bb.3
+    S_BRANCH %bb.3
+  bb.3:
+    ; VALU reads vgpr16 from WMMA in bb.1
+    successors: %bb.3, %bb.4
+    $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+    S_CBRANCH_EXECZ %bb.4, implicit $exec
+    S_BRANCH %bb.3
+  bb.4:
+    ; L2 latch - back to L2 header or exit to L1 latch
+    successors: %bb.2, %bb.5
+    S_CBRANCH_SCC1 %bb.5, implicit undef $scc
+    S_BRANCH %bb.2
+  bb.5:
+    ; L1 latch - back to L1 header or exit
+    successors: %bb.1, %bb.6
+    S_CBRANCH_SCC1 %bb.6, implicit undef $scc
+    S_BRANCH %bb.1
+  bb.6:
+    ; Exit
+    S_ENDPGM 0
+...
+
+# Test 6: No preheader (multiple predecessors) - cannot hoist
+---
+name: test_no_preheader_no_hoist
+body: |
+  ; NOHOIST-LABEL: name: test_no_preheader_no_hoist
+  ; NOHOIST: bb.0:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; NOHOIST-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit undef $scc
+  ; NOHOIST-NEXT:   S_BRANCH %bb.1
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.1:
+  ; NOHOIST-NEXT:   successors: %bb.2(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   S_BRANCH %bb.2
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.2:
+  ; NOHOIST-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; NOHOIST-NEXT:   S_CBRANCH_EXECZ %bb.3, implicit $exec
+  ; NOHOIST-NEXT:   S_BRANCH %bb.2
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.3:
+  ; NOHOIST-NEXT:   S_ENDPGM 0
+  ;
+  ; HOIST-LABEL: name: test_no_preheader_no_hoist
+  ; HOIST: bb.0:
+  ; HOIST-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; HOIST-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit undef $scc
+  ; HOIST-NEXT:   S_BRANCH %bb.1
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.1:
+  ; HOIST-NEXT:   successors: %bb.2(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   S_BRANCH %bb.2
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.2:
+  ; HOIST-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; HOIST-NEXT:   S_CBRANCH_EXECZ %bb.3, implicit $exec
+  ; HOIST-NEXT:   S_BRANCH %bb.2
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.3:
+  ; HOIST-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    S_CBRANCH_SCC1 %bb.2, implicit undef $scc
+    S_BRANCH %bb.1
+  bb.1:
+    successors: %bb.2
+    S_BRANCH %bb.2
+  bb.2:
+    ; Loop header with two predecessors (bb.0 and bb.1)
+    successors: %bb.2, %bb.3
+    $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+    S_CBRANCH_EXECZ %bb.3, implicit $exec
+    S_BRANCH %bb.2
+  bb.3:
+    S_ENDPGM 0
+...
+
+# Test 7: Not in a loop; should NOT hoist
+---
+name: test_not_in_loop_no_hoist
+body: |
+  bb.0:
+    ; NOHOIST-LABEL: name: test_not_in_loop_no_hoist
+    ; NOHOIST: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+    ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+    ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+    ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+    ; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+    ;
+    ; HOIST-LABEL: name: test_not_in_loop_no_hoist
+    ; HOIST: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    ; HOIST-NEXT: V_NOP_e32 implicit $exec
+    ; HOIST-NEXT: V_NOP_e32 implicit $exec
+    ; HOIST-NEXT: V_NOP_e32 implicit $exec
+    ; HOIST-NEXT: V_NOP_e32 implicit $exec
+    ; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+...
+

>From 95f7b772cf3a81f79876a4c1913dd5900466b511 Mon Sep 17 00:00:00 2001
From: Prasoon Mishra <Prasoon.Mishra at amd.com>
Date: Tue, 20 Jan 2026 16:04:05 +0000
Subject: [PATCH 2/7] Fix: - Drop DebugLoc from hoisted V_NOPs - Use
 printMBBReference

---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 5ba511d3d5e0f..1680286e66612 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2163,11 +2163,8 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
 void GCNHazardRecognizer::insertVnopsBeforeTerminator(MachineBasicBlock *MBB,
                                                       int Count) {
   MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
-  const DebugLoc &DL =
-      InsertPt != MBB->end() ? InsertPt->getDebugLoc() : DebugLoc();
-
   for (int i = 0; i < Count; ++i) {
-    BuildMI(*MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
+    BuildMI(*MBB, InsertPt, DebugLoc(), TII.get(AMDGPU::V_NOP_e32));
   }
 }
 
@@ -2297,7 +2294,8 @@ bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
   }
 
   LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
-                    << " V_NOPs from loop to " << Preheader->getName() << "\n");
+                    << " V_NOPs from loop to " << printMBBReference(*Preheader)
+                    << "\n");
 
   insertVnopsBeforeTerminator(Preheader, WaitStatesNeeded);
   NumWMMANopsHoisted += WaitStatesNeeded;

>From e1c6bcd30b0306eeec0f612f20dd212c752ef463 Mon Sep 17 00:00:00 2001
From: Prasoon Mishra <Prasoon.Mishra at amd.com>
Date: Wed, 21 Jan 2026 03:56:03 +0000
Subject: [PATCH 3/7] Used existing emitVNops and added a test for preheader
 without terminator.

---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 30 ++++--------
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h  |  8 ++--
 .../test/CodeGen/AMDGPU/wmma-nop-hoisting.mir | 46 ++++++++++++++++++-
 3 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 1680286e66612..bf60330aa0ad8 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1273,18 +1273,12 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
          getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
 }
 
-// emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we need
-// to insert, negative means not needed.
-bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) {
-  if (WaitStatesNeeded <= 0)
-    return false;
-
-  const SIInstrInfo *TII = ST.getInstrInfo();
+void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator InsertPt,
+                                    int WaitStatesNeeded, bool IsHoisting) {
+  const DebugLoc &DL = IsHoisting ? DebugLoc() : InsertPt->getDebugLoc();
   for (int I = 0; I < WaitStatesNeeded; ++I)
-    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-            TII->get(AMDGPU::V_NOP_e32));
-
-  return true;
+    BuildMI(MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
 }
 
 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
@@ -2160,14 +2154,6 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
   return WaitStatesNeeded;
 }
 
-void GCNHazardRecognizer::insertVnopsBeforeTerminator(MachineBasicBlock *MBB,
-                                                      int Count) {
-  MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
-  for (int i = 0; i < Count; ++i) {
-    BuildMI(*MBB, InsertPt, DebugLoc(), TII.get(AMDGPU::V_NOP_e32));
-  }
-}
-
 bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
     const MachineInstr &WMMA, const MachineInstr &MI) const {
   Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
@@ -2297,7 +2283,8 @@ bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
                     << " V_NOPs from loop to " << printMBBReference(*Preheader)
                     << "\n");
 
-  insertVnopsBeforeTerminator(Preheader, WaitStatesNeeded);
+  emitVNops(*Preheader, Preheader->getFirstTerminator(), WaitStatesNeeded,
+            /*IsHoisting=*/true);
   NumWMMANopsHoisted += WaitStatesNeeded;
   return true;
 }
@@ -2310,7 +2297,8 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
   if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
     return true;
 
-  return emitVNops(MI, WaitStatesNeeded);
+  emitVNops(*MI->getParent(), MI->getIterator(), WaitStatesNeeded);
+  return true;
 }
 
 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 0a66b4206ce4a..87a9aba69ec32 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -107,9 +107,10 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   int checkReadM0Hazards(MachineInstr *SMovRel);
   int checkNSAtoVMEMHazard(MachineInstr *MI);
   int checkFPAtomicToDenormModeHazard(MachineInstr *MI);
-  // Emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we
-  // need to insert, negative means not needed.
-  bool emitVNops(MachineInstr *MI, int WaitStatesNeeded);
+  // Emit \p WaitStatesNeeded V_NOP instructions before \p InsertPt.
+  // If IsHoisting is true, uses empty DebugLoc for compiler-inserted NOPs.
+  void emitVNops(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+                 int WaitStatesNeeded, bool IsHoisting = false);
   void fixHazards(MachineInstr *MI);
   bool fixVcmpxPermlaneHazards(MachineInstr *MI);
   bool fixVMEMtoScalarWriteHazards(MachineInstr *MI);
@@ -134,7 +135,6 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
                                const MachineInstr &MI) const;
   bool isCoexecutionHazardFor(const MachineInstr &I,
                               const MachineInstr &MI) const;
-  void insertVnopsBeforeTerminator(MachineBasicBlock *MBB, int Count);
   bool fixShift64HighRegBug(MachineInstr *MI);
   bool fixVALUMaskWriteHazard(MachineInstr *MI);
   bool fixRequiredExportPriority(MachineInstr *MI);
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
index 3feabeb8986ce..e0ebbf0b53d9f 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
@@ -426,7 +426,51 @@ body: |
     S_ENDPGM 0
 ...
 
-# Test 7: Not in a loop; should NOT hoist
+# Test 7: Preheader without terminator
+---
+name: test_fallthrough_preheader_hoist
+body: |
+  ; NOHOIST-LABEL: name: test_fallthrough_preheader_hoist
+  ; NOHOIST: bb.0:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.1:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; NOHOIST-NEXT:   S_BRANCH %bb.1
+  ;
+  ; HOIST-LABEL: name: test_fallthrough_preheader_hoist
+  ; HOIST: bb.0:
+  ; HOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.1:
+  ; HOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; HOIST-NEXT:   S_BRANCH %bb.1
+  bb.0:
+    successors: %bb.1
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  bb.1:
+    successors: %bb.1
+    $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+    S_BRANCH %bb.1
+...
+
+# Test 8: Not in a loop; should NOT hoist
 ---
 name: test_not_in_loop_no_hoist
 body: |

>From 58c4ab7d0eb97756c0d49fb42fc4253de19e067d Mon Sep 17 00:00:00 2001
From: Prasoon Mishra <Prasoon.Mishra at amd.com>
Date: Wed, 21 Jan 2026 09:58:38 +0000
Subject: [PATCH 4/7] Take MLI from Pass manager and small nitfix

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |  4 ++-
 llvm/lib/CodeGen/PostRAHazardRecognizer.cpp   | 21 ++++++++++------
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 25 ++++++-------------
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h  |  8 +++---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  5 ++--
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  3 ++-
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |  9 +++++++
 7 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 45713360d44de..ed968d7bd4593 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -47,6 +47,7 @@ class InstrItineraryData;
 class LiveIntervals;
 class LiveVariables;
 class MachineLoop;
+class MachineLoopInfo;
 class MachineMemOperand;
 class MachineModuleInfo;
 class MachineRegisterInfo;
@@ -1785,7 +1786,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
   /// Allocate and return a hazard recognizer to use for by non-scheduling
   /// passes.
   virtual ScheduleHazardRecognizer *
-  CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
+  CreateTargetPostRAHazardRecognizer(const MachineFunction &MF,
+                                     MachineLoopInfo *MLI) const {
     return nullptr;
   }
 
diff --git a/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp b/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp
index 29cfc06d90b29..906eea29f1a15 100644
--- a/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp
+++ b/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp
@@ -29,6 +29,7 @@
 #include "llvm/CodeGen/PostRAHazardRecognizer.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -42,7 +43,7 @@ STATISTIC(NumNoops, "Number of noops inserted");
 
 namespace {
 struct PostRAHazardRecognizer {
-  bool run(MachineFunction &MF);
+  bool run(MachineFunction &MF, MachineLoopInfo *MLI);
 };
 
 class PostRAHazardRecognizerLegacy : public MachineFunctionPass {
@@ -53,11 +54,13 @@ class PostRAHazardRecognizerLegacy : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfoWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
   bool runOnMachineFunction(MachineFunction &Fn) override {
-    return PostRAHazardRecognizer().run(Fn);
+    MachineLoopInfo &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+    return PostRAHazardRecognizer().run(Fn, &MLI);
   }
 };
 char PostRAHazardRecognizerLegacy::ID = 0;
@@ -66,13 +69,17 @@ char PostRAHazardRecognizerLegacy::ID = 0;
 
 char &llvm::PostRAHazardRecognizerID = PostRAHazardRecognizerLegacy::ID;
 
-INITIALIZE_PASS(PostRAHazardRecognizerLegacy, DEBUG_TYPE,
-                "Post RA hazard recognizer", false, false)
+INITIALIZE_PASS_BEGIN(PostRAHazardRecognizerLegacy, DEBUG_TYPE,
+                      "Post RA hazard recognizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
+INITIALIZE_PASS_END(PostRAHazardRecognizerLegacy, DEBUG_TYPE,
+                    "Post RA hazard recognizer", false, false)
 
 PreservedAnalyses
 llvm::PostRAHazardRecognizerPass::run(MachineFunction &MF,
                                       MachineFunctionAnalysisManager &MFAM) {
-  if (!PostRAHazardRecognizer().run(MF))
+  MachineLoopInfo *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
+  if (!PostRAHazardRecognizer().run(MF, MLI))
     return PreservedAnalyses::all();
 
   auto PA = getMachineFunctionPassPreservedAnalyses();
@@ -80,10 +87,10 @@ llvm::PostRAHazardRecognizerPass::run(MachineFunction &MF,
   return PA;
 }
 
-bool PostRAHazardRecognizer::run(MachineFunction &Fn) {
+bool PostRAHazardRecognizer::run(MachineFunction &Fn, MachineLoopInfo *MLI) {
   const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
   std::unique_ptr<ScheduleHazardRecognizer> HazardRec(
-      TII->CreateTargetPostRAHazardRecognizer(Fn));
+      TII->CreateTargetPostRAHazardRecognizer(Fn, MLI));
 
   // Return if the target has not implemented a hazard recognizer.
   if (!HazardRec)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index bf60330aa0ad8..57d60e0c21dee 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -70,10 +70,11 @@ static cl::opt<bool> EnableWMMAVnopHoisting(
 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                  const GCNSubtarget &ST);
 
-GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
+GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF,
+                                         MachineLoopInfo *MLI)
     : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
       ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
-      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
+      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
       ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
@@ -2206,13 +2207,15 @@ bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
 
 bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
                                                  const MachineInstr &MI) const {
+  // I is the potential WMMA hazard source, MI is the instruction being checked
+  // for hazard.
   if (!TII.isXDLWMMA(I))
     return false;
 
   // Dispatch based on MI type
   if (TII.isXDLWMMA(MI))
     return hasWMMAToWMMARegOverlap(I, MI);
-  else if (isCoexecutableVALUInst(MI))
+  if (isCoexecutableVALUInst(MI))
     return hasWMMAToVALURegOverlap(I, MI);
 
   return false;
@@ -2235,22 +2238,10 @@ bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
   return false;
 }
 
-void GCNHazardRecognizer::ensureLoopInfoAvailable() {
-  // Lazily compute MDT and MLI only when needed
-  if (MLI)
-    return;
-
-  OwnedMDT =
-      std::make_unique<MachineDominatorTree>(const_cast<MachineFunction &>(MF));
-  OwnedMLI = std::make_unique<MachineLoopInfo>();
-  OwnedMLI->analyze(*OwnedMDT);
-
-  MLI = OwnedMLI.get();
-}
-
 bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
                                                     int WaitStatesNeeded) {
-  ensureLoopInfoAvailable();
+  if (!MLI)
+    return false;
 
   MachineLoop *L = MLI->getLoopFor(MI->getParent());
   if (!L) {
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 87a9aba69ec32..3f8c43673a3ba 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -53,9 +53,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   const SIRegisterInfo &TRI;
   const TargetSchedModel &TSchedModel;
 
-  // Loop info for V_NOP hoisting, computed on demand only when needed.
-  std::unique_ptr<MachineDominatorTree> OwnedMDT;
-  std::unique_ptr<MachineLoopInfo> OwnedMLI;
+  // Loop info for V_NOP hoisting, passed from the pass manager.
   MachineLoopInfo *MLI = nullptr;
 
   bool RunLdsBranchVmemWARHazardFixup;
@@ -125,7 +123,6 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   bool fixWMMAHazards(MachineInstr *MI);
   int checkWMMACoexecutionHazards(MachineInstr *MI);
   bool fixWMMACoexecutionHazards(MachineInstr *MI);
-  void ensureLoopInfoAvailable();
   bool tryHoistWMMAVnopsFromLoop(MachineInstr *MI, int WaitStatesNeeded);
   bool hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
                            bool IncludeSubloops = true);
@@ -167,7 +164,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   int checkPermlaneHazards(MachineInstr *MI);
 
 public:
-  GCNHazardRecognizer(const MachineFunction &MF);
+  GCNHazardRecognizer(const MachineFunction &MF,
+                      MachineLoopInfo *MLI = nullptr);
   // We can only issue one instruction per cycle.
   bool atIssueLimit() const override { return true; }
   void EmitInstruction(SUnit *SU) override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9180d5fc8bcf0..426f97bfff676 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9911,8 +9911,9 @@ SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
 /// pass.
 ScheduleHazardRecognizer *
-SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
-  return new GCNHazardRecognizer(MF);
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF,
+                                                MachineLoopInfo *MLI) const {
+  return new GCNHazardRecognizer(MF, MLI);
 }
 
 // Called during:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0b54513bb6114..d7c149ad0c355 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1582,7 +1582,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                                  const ScheduleDAG *DAG) const override;
 
   ScheduleHazardRecognizer *
-  CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
+  CreateTargetPostRAHazardRecognizer(const MachineFunction &MF,
+                                     MachineLoopInfo *MLI) const override;
 
   ScheduleHazardRecognizer *
   CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 2904ba604fb1b..0e7caad5786d6 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -146,6 +146,7 @@
 ; GCN-O0-NEXT:        SI insert wait instructions
 ; GCN-O0-NEXT:        Insert required mode register values
 ; GCN-O0-NEXT:        SI Final Branch Preparation
+; GCN-O0-NEXT:        Machine Natural Loop Construction
 ; GCN-O0-NEXT:        Post RA hazard recognizer
 ; GCN-O0-NEXT:        AMDGPU Insert waits for SGPR read hazards
 ; GCN-O0-NEXT:        AMDGPU Lower VGPR Encoding
@@ -438,6 +439,8 @@
 ; GCN-O1-NEXT:        SI Insert Hard Clauses
 ; GCN-O1-NEXT:        SI Final Branch Preparation
 ; GCN-O1-NEXT:        SI peephole optimizations
+; GCN-O1-NEXT:        MachineDominator Tree Construction
+; GCN-O1-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-NEXT:        Post RA hazard recognizer
 ; GCN-O1-NEXT:        AMDGPU Insert waits for SGPR read hazards
 ; GCN-O1-NEXT:        AMDGPU Lower VGPR Encoding
@@ -758,6 +761,8 @@
 ; GCN-O1-OPTS-NEXT:        SI Insert Hard Clauses
 ; GCN-O1-OPTS-NEXT:        SI Final Branch Preparation
 ; GCN-O1-OPTS-NEXT:        SI peephole optimizations
+; GCN-O1-OPTS-NEXT:        MachineDominator Tree Construction
+; GCN-O1-OPTS-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT:        Post RA hazard recognizer
 ; GCN-O1-OPTS-NEXT:        AMDGPU Insert waits for SGPR read hazards
 ; GCN-O1-OPTS-NEXT:        AMDGPU Lower VGPR Encoding
@@ -1083,6 +1088,8 @@
 ; GCN-O2-NEXT:        SI Insert Hard Clauses
 ; GCN-O2-NEXT:        SI Final Branch Preparation
 ; GCN-O2-NEXT:        SI peephole optimizations
+; GCN-O2-NEXT:        MachineDominator Tree Construction
+; GCN-O2-NEXT:        Machine Natural Loop Construction
 ; GCN-O2-NEXT:        Post RA hazard recognizer
 ; GCN-O2-NEXT:        AMDGPU Insert waits for SGPR read hazards
 ; GCN-O2-NEXT:        AMDGPU Lower VGPR Encoding
@@ -1421,6 +1428,8 @@
 ; GCN-O3-NEXT:        SI Insert Hard Clauses
 ; GCN-O3-NEXT:        SI Final Branch Preparation
 ; GCN-O3-NEXT:        SI peephole optimizations
+; GCN-O3-NEXT:        MachineDominator Tree Construction
+; GCN-O3-NEXT:        Machine Natural Loop Construction
 ; GCN-O3-NEXT:        Post RA hazard recognizer
 ; GCN-O3-NEXT:        AMDGPU Insert waits for SGPR read hazards
 ; GCN-O3-NEXT:        AMDGPU Lower VGPR Encoding

>From 87d6471ef6ae6e75a1d20588af4acc97e181d09a Mon Sep 17 00:00:00 2001
From: Prasoon Mishra <Prasoon.Mishra at amd.com>
Date: Wed, 21 Jan 2026 11:19:59 +0000
Subject: [PATCH 5/7] Remove the <memory> and MachineDominators.h includes,
 which are no longer needed.

---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 3f8c43673a3ba..b331504d40113 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -15,12 +15,10 @@
 
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include <list>
-#include <memory>
 
 namespace llvm {
 

>From 8255ab4787df91be737c66377ec1072df38815bc Mon Sep 17 00:00:00 2001
From: Dark Steve <Prasoon.Mishra at amd.com>
Date: Wed, 18 Feb 2026 06:38:43 +0000
Subject: [PATCH 6/7] Revert the llc-pipeline.ll changes, as late branching
 and peephole now preserve MLI.

---
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 0e7caad5786d6..2904ba604fb1b 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -146,7 +146,6 @@
 ; GCN-O0-NEXT:        SI insert wait instructions
 ; GCN-O0-NEXT:        Insert required mode register values
 ; GCN-O0-NEXT:        SI Final Branch Preparation
-; GCN-O0-NEXT:        Machine Natural Loop Construction
 ; GCN-O0-NEXT:        Post RA hazard recognizer
 ; GCN-O0-NEXT:        AMDGPU Insert waits for SGPR read hazards
 ; GCN-O0-NEXT:        AMDGPU Lower VGPR Encoding
@@ -439,8 +438,6 @@
 ; GCN-O1-NEXT:        SI Insert Hard Clauses
 ; GCN-O1-NEXT:        SI Final Branch Preparation
 ; GCN-O1-NEXT:        SI peephole optimizations
-; GCN-O1-NEXT:        MachineDominator Tree Construction
-; GCN-O1-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-NEXT:        Post RA hazard recognizer
 ; GCN-O1-NEXT:        AMDGPU Insert waits for SGPR read hazards
 ; GCN-O1-NEXT:        AMDGPU Lower VGPR Encoding
@@ -761,8 +758,6 @@
 ; GCN-O1-OPTS-NEXT:        SI Insert Hard Clauses
 ; GCN-O1-OPTS-NEXT:        SI Final Branch Preparation
 ; GCN-O1-OPTS-NEXT:        SI peephole optimizations
-; GCN-O1-OPTS-NEXT:        MachineDominator Tree Construction
-; GCN-O1-OPTS-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT:        Post RA hazard recognizer
 ; GCN-O1-OPTS-NEXT:        AMDGPU Insert waits for SGPR read hazards
 ; GCN-O1-OPTS-NEXT:        AMDGPU Lower VGPR Encoding
@@ -1088,8 +1083,6 @@
 ; GCN-O2-NEXT:        SI Insert Hard Clauses
 ; GCN-O2-NEXT:        SI Final Branch Preparation
 ; GCN-O2-NEXT:        SI peephole optimizations
-; GCN-O2-NEXT:        MachineDominator Tree Construction
-; GCN-O2-NEXT:        Machine Natural Loop Construction
 ; GCN-O2-NEXT:        Post RA hazard recognizer
 ; GCN-O2-NEXT:        AMDGPU Insert waits for SGPR read hazards
 ; GCN-O2-NEXT:        AMDGPU Lower VGPR Encoding
@@ -1428,8 +1421,6 @@
 ; GCN-O3-NEXT:        SI Insert Hard Clauses
 ; GCN-O3-NEXT:        SI Final Branch Preparation
 ; GCN-O3-NEXT:        SI peephole optimizations
-; GCN-O3-NEXT:        MachineDominator Tree Construction
-; GCN-O3-NEXT:        Machine Natural Loop Construction
 ; GCN-O3-NEXT:        Post RA hazard recognizer
 ; GCN-O3-NEXT:        AMDGPU Insert waits for SGPR read hazards
 ; GCN-O3-NEXT:        AMDGPU Lower VGPR Encoding

>From 3ca4039ebb29378a12798041e4225a08fb32f282 Mon Sep 17 00:00:00 2001
From: Dark Steve <Prasoon.Mishra at amd.com>
Date: Wed, 25 Feb 2026 04:06:25 +0000
Subject: [PATCH 7/7] Add a test where the hazard occurs via a loop back-edge

---
 .../test/CodeGen/AMDGPU/wmma-nop-hoisting.mir | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
index e0ebbf0b53d9f..9918c9077d1b7 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
@@ -494,3 +494,49 @@ body: |
     $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
 ...
 
+# Test 9: VALU first, then WMMA in the same loop (hazard via back-edge); V_NOPs must stay in the loop
+---
+name: test_valu_before_wmma_backedge_no_hoist
+body: |
+  ; NOHOIST-LABEL: name: test_valu_before_wmma_backedge_no_hoist
+  ; NOHOIST: bb.0:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   S_BRANCH %bb.1
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT: bb.1:
+  ; NOHOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; NOHOIST-NEXT: {{  $}}
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; NOHOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; NOHOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; NOHOIST-NEXT:   S_BRANCH %bb.1
+  ;
+  ; HOIST-LABEL: name: test_valu_before_wmma_backedge_no_hoist
+  ; HOIST: bb.0:
+  ; HOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   S_BRANCH %bb.1
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT: bb.1:
+  ; HOIST-NEXT:   successors: %bb.1(0x80000000)
+  ; HOIST-NEXT: {{  $}}
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   V_NOP_e32 implicit $exec
+  ; HOIST-NEXT:   $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+  ; HOIST-NEXT:   early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+  ; HOIST-NEXT:   S_BRANCH %bb.1
+  bb.0:
+    successors: %bb.1
+    S_BRANCH %bb.1
+  bb.1:
+    successors: %bb.1
+    $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+...



More information about the llvm-commits mailing list