[llvm] [AMDGPU] Hoist WMMA coexecution hazard V_NOPs from loops to preheaders (PR #176895)
Dark Steve via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 24 20:10:42 PST 2026
https://github.com/PrasoonMishra updated https://github.com/llvm/llvm-project/pull/176895
>From 35f1282aeb9f337aa742106e0e1d041d8fd789f9 Mon Sep 17 00:00:00 2001
From: Prasoon Mishra <Prasoon.Mishra at amd.com>
Date: Tue, 20 Jan 2026 08:23:18 +0000
Subject: [PATCH 1/7] [AMDGPU] Hoist WMMA coexecution hazard V_NOPs from loops
to preheaders
On GFX1250, V_NOPs inserted for WMMA coexecution hazards are placed at
the use-site. When the hazard-consuming instruction is inside a loop and
the WMMA is outside, these NOPs execute every iteration even though the
hazard only needs to be covered once.
This patch hoists the V_NOPs to the loop preheader, reducing executions
from N iterations to 1.
Example (assuming a hazard requiring K V_NOPs):
Before:
bb.0 (preheader): WMMA writes vgpr0
bb.1 (loop): V_NOP xK, VALU reads vgpr0, branch bb.1
-> K NOPs executed per iteration
After:
bb.0 (preheader): WMMA writes vgpr0, V_NOP xK
bb.1 (loop): VALU reads vgpr0, branch bb.1
-> K NOPs executed once
For nested loops, V_NOPs are hoisted to the preheader of the outermost
enclosing loop that contains no conflicting WMMA.
Hoisting is restricted to strict preheaders (not any single predecessor)
to avoid introducing V_NOPs on unrelated control flow paths.
MachineLoopInfo is computed lazily within GCNHazardRecognizer.
The optimization is controlled by -amdgpu-wmma-vnop-hoisting (default: on).
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 222 +++++++--
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 21 +
.../test/CodeGen/AMDGPU/wmma-nop-hoisting.mir | 456 ++++++++++++++++--
3 files changed, 620 insertions(+), 79 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index d504d8618b90d..5ba511d3d5e0f 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -14,14 +14,23 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
+#define DEBUG_TYPE "gcn-hazard-recognizer"
+
+STATISTIC(NumWMMANopsHoisted,
+ "Number of WMMA hazard V_NOPs hoisted from loops");
+STATISTIC(NumWMMAHoistingBailed,
+ "Number of WMMA hazards where V_NOP hoisting was not possible");
+
namespace {
struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
@@ -50,6 +59,10 @@ static cl::opt<unsigned>
NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
cl::desc("Insert a s_nop x before every instruction"));
+static cl::opt<bool> EnableWMMAVnopHoisting(
+ "amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden,
+ cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
+
//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//
@@ -1288,7 +1301,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVALUTransUseHazard(MI);
fixVALUTransCoexecutionHazards(MI);
fixWMMAHazards(MI); // fall-through if co-execution is enabled.
- emitVNops(MI, checkWMMACoexecutionHazards(MI));
+ fixWMMACoexecutionHazards(MI);
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
fixRequiredExportPriority(MI);
@@ -2084,8 +2097,6 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
return 0;
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
-
// WaitStates here is the number of V_NOPs or unrelated VALU instructions must
// be in between the first WMMA and the second instruction to cover the hazard
// (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
@@ -2095,7 +2106,7 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
const int VALUWaitStates[] = {4, 8, 2, 4};
unsigned Category = 0;
- auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+ auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
if (!TII->isXDLWMMA(I))
return false;
@@ -2103,24 +2114,10 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
return false;
- Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
- Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
- Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
-
- // WMMA0 wrires (D0), WMMA1 reads (A1/B1/Idx1).
- if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
- return true;
-
- if (SIInstrInfo::isSWMMAC(*MI)) {
- Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
- if (TRI->regsOverlap(D0, Idx1))
- return true;
- }
-
- return false;
+ return hasWMMAToWMMARegOverlap(I, *MI);
};
- auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+ auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
if (!TII->isXDLWMMA(I))
return false;
@@ -2128,35 +2125,7 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
return false;
- // WMMA writes, VALU reads.
- Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
- for (const MachineOperand &ValuUse : MI->explicit_uses()) {
- if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
- return true;
- }
-
- auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
- if (!ValuDst || !ValuDst->isReg())
- return false;
- Register D1 = ValuDst->getReg();
-
- // WMMA writes, VALU writes.
- if (TRI->regsOverlap(D0, D1))
- return true;
-
- // WMMA reads, VALU writes.
- Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
- Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
- if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
- return true;
-
- if (SIInstrInfo::isSWMMAC(I)) {
- Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
- if (TRI->regsOverlap(D1, Idx0))
- return true;
- }
-
- return false;
+ return hasWMMAToVALURegOverlap(I, *MI);
};
int Limit = 0;
@@ -2191,6 +2160,161 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
return WaitStatesNeeded;
}
+/// Insert \p Count V_NOP_e32 instructions into \p MBB at the position returned
+/// by MachineBasicBlock::getFirstTerminator(). A non-positive \p Count inserts
+/// nothing.
+void GCNHazardRecognizer::insertVnopsBeforeTerminator(MachineBasicBlock *MBB,
+                                                      int Count) {
+  MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
+
+  // Borrow the debug location of the instruction at the insertion point when
+  // there is one; otherwise the V_NOPs carry an empty location.
+  DebugLoc DL;
+  if (InsertPt != MBB->end())
+    DL = InsertPt->getDebugLoc();
+
+  for (int I = 0; I < Count; ++I)
+    BuildMI(*MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
+}
+
+/// Return true if the vdst written by \p WMMA overlaps a source operand read
+/// by the WMMA \p MI: src0, src1, or additionally src2 when \p MI is an SWMMAC.
+bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
+    const MachineInstr &WMMA, const MachineInstr &MI) const {
+  const Register Written =
+      TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
+  const Register ReadA = TII.getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
+  const Register ReadB = TII.getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
+
+  // The first WMMA writes its result; the second WMMA reads it as A or B.
+  if (TRI.regsOverlap(Written, ReadA) || TRI.regsOverlap(Written, ReadB))
+    return true;
+
+  // Only SWMMAC has the extra src2 read that is checked here.
+  if (!SIInstrInfo::isSWMMAC(MI))
+    return false;
+
+  const Register ReadIdx =
+      TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
+  return TRI.regsOverlap(Written, ReadIdx);
+}
+
+/// Return true if the registers of \p WMMA and the VALU instruction \p MI
+/// overlap in any of the checked ways:
+///   - \p WMMA writes a register that \p MI reads through an explicit use,
+///   - both write overlapping destination registers,
+///   - \p MI writes a register that \p WMMA reads (src0, src1, or src2 when
+///     \p WMMA is an SWMMAC).
+bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
+    const MachineInstr &WMMA, const MachineInstr &MI) const {
+  const Register WmmaDst =
+      TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
+
+  // Case 1: WMMA writes, VALU reads.
+  const bool ValuReadsWmmaDst =
+      llvm::any_of(MI.explicit_uses(), [&](const MachineOperand &Use) {
+        return Use.isReg() && TRI.regsOverlap(WmmaDst, Use.getReg());
+      });
+  if (ValuReadsWmmaDst)
+    return true;
+
+  // The remaining cases need a register destination on the VALU side.
+  const MachineOperand *ValuDstOp =
+      TII.getNamedOperand(MI, AMDGPU::OpName::vdst);
+  if (!ValuDstOp || !ValuDstOp->isReg())
+    return false;
+  const Register ValuDst = ValuDstOp->getReg();
+
+  // Case 2: WMMA writes, VALU writes.
+  if (TRI.regsOverlap(WmmaDst, ValuDst))
+    return true;
+
+  // Case 3: WMMA reads, VALU writes.
+  const Register WmmaA =
+      TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
+  const Register WmmaB =
+      TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
+  if (TRI.regsOverlap(WmmaA, ValuDst) || TRI.regsOverlap(WmmaB, ValuDst))
+    return true;
+
+  if (!SIInstrInfo::isSWMMAC(WMMA))
+    return false;
+
+  const Register WmmaIdx =
+      TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
+  return TRI.regsOverlap(ValuDst, WmmaIdx);
+}
+
+/// Return true if \p I is an XDL WMMA whose registers overlap those of \p MI in
+/// a way that matters for the coexecution hazard.
+///
+/// Only register overlap is tested here; no latency or distance between the
+/// two instructions is taken into account. \p MI is expected to be either an
+/// XDL WMMA or a coexecutable VALU instruction; anything else yields false.
+bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
+                                                 const MachineInstr &MI) const {
+  if (!TII.isXDLWMMA(I))
+    return false;
+
+  // Pick the overlap rule that matches the kind of instruction MI is.
+  if (TII.isXDLWMMA(MI))
+    return hasWMMAToWMMARegOverlap(I, MI);
+  if (isCoexecutableVALUInst(MI))
+    return hasWMMAToVALURegOverlap(I, MI);
+
+  return false;
+}
+
+/// Return true if any instruction other than \p MI inside loop \p L is a
+/// coexecution hazard for \p MI according to isCoexecutionHazardFor().
+///
+/// When \p IncludeSubloops is false, only blocks whose loop as reported by
+/// MLI->getLoopFor() is exactly \p L are scanned; blocks belonging to any
+/// nested loop are skipped.
+bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
+                                              bool IncludeSubloops) {
+  // TODO: Avoid full loop scan when WMMA is beyond VALU distance.
+  for (MachineBasicBlock *Block : L->getBlocks()) {
+    const bool ScanBlock = IncludeSubloops || MLI->getLoopFor(Block) == L;
+    if (!ScanBlock)
+      continue;
+
+    for (MachineInstr &Candidate : *Block) {
+      // MI itself is never treated as its own hazard source.
+      if (&Candidate != MI && isCoexecutionHazardFor(Candidate, *MI))
+        return true;
+    }
+  }
+  return false;
+}
+
+/// Build the dominator tree and loop info on first use and point MLI at the
+/// result. Later calls are no-ops because MLI is already set.
+void GCNHazardRecognizer::ensureLoopInfoAvailable() {
+  if (!MLI) {
+    // MachineDominatorTree is built from a mutable MachineFunction reference,
+    // hence the cast on MF.
+    MachineFunction &MutableMF = const_cast<MachineFunction &>(MF);
+    OwnedMDT = std::make_unique<MachineDominatorTree>(MutableMF);
+
+    OwnedMLI = std::make_unique<MachineLoopInfo>();
+    OwnedMLI->analyze(*OwnedMDT);
+
+    MLI = OwnedMLI.get();
+  }
+}
+
+/// Try to cover the WMMA coexecution hazard on \p MI by inserting the
+/// \p WaitStatesNeeded V_NOPs into a loop preheader instead of in front of
+/// \p MI.
+///
+/// Starting from the loop that directly contains \p MI, the target loop is
+/// widened to enclosing loops for as long as the enclosing loop contains no
+/// WMMA that hazards \p MI. The V_NOPs are then placed before the terminator of
+/// the target loop's preheader.
+///
+/// \returns true if the V_NOPs were inserted into a preheader. Returns false,
+/// and bumps NumWMMAHoistingBailed, when \p MI is not inside a loop, when the
+/// innermost loop itself contains a hazarding WMMA, or when the target loop has
+/// no preheader.
+bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
+                                                    int WaitStatesNeeded) {
+  ensureLoopInfoAvailable();
+
+  MachineLoop *L = MLI->getLoopFor(MI->getParent());
+  if (!L) {
+    ++NumWMMAHoistingBailed;
+    return false;
+  }
+
+  // If innermost loop has WMMA hazard, we can't hoist at all
+  if (hasWMMAHazardInLoop(L, MI)) {
+    ++NumWMMAHoistingBailed;
+    return false;
+  }
+
+  // Find outermost loop with no internal hazard
+  MachineLoop *TargetLoop = L;
+  while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
+    // Scan every block of the parent, including all of its subloops. Limiting
+    // the scan to the parent's own blocks would also skip sibling subloops of
+    // the loop we came from; a hazarding WMMA there re-creates the hazard on
+    // every iteration of the parent, so hoisting past it would be unsafe.
+    // Rescanning the already-checked inner loop is redundant but harmless.
+    if (hasWMMAHazardInLoop(Parent, MI))
+      break; // Parent contains a hazard, stop here
+    TargetLoop = Parent; // Safe to hoist further out
+  }
+
+  // Need valid preheader to insert V_NOPs
+  MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
+  if (!Preheader) {
+    ++NumWMMAHoistingBailed;
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
+                    << " V_NOPs from loop to " << Preheader->getName() << "\n");
+
+  insertVnopsBeforeTerminator(Preheader, WaitStatesNeeded);
+  NumWMMANopsHoisted += WaitStatesNeeded;
+  return true;
+}
+
+/// Resolve the WMMA coexecution hazard on \p MI, if there is one.
+///
+/// When hoisting is enabled via EnableWMMAVnopHoisting, the required V_NOPs
+/// are first offered to tryHoistWMMAVnopsFromLoop(); if that declines (or
+/// hoisting is disabled) they are emitted through emitVNops() instead.
+///
+/// \returns false when no wait states are needed, true when the V_NOPs were
+/// hoisted, and otherwise the result of emitVNops().
+bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
+  const int NopsRequired = checkWMMACoexecutionHazards(MI);
+  if (NopsRequired <= 0)
+    return false;
+
+  const bool Hoisted =
+      EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, NopsRequired);
+  return Hoisted || emitVNops(MI, NopsRequired);
+}
+
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
if (!ST.hasShift64HighRegBug())
return false;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index d725134639cfe..0a66b4206ce4a 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -15,9 +15,12 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include <list>
+#include <memory>
namespace llvm {
@@ -49,6 +52,12 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
const TargetSchedModel &TSchedModel;
+
+ // Loop info for V_NOP hoisting, computed on demand only when needed.
+ std::unique_ptr<MachineDominatorTree> OwnedMDT;
+ std::unique_ptr<MachineLoopInfo> OwnedMLI;
+ MachineLoopInfo *MLI = nullptr;
+
bool RunLdsBranchVmemWARHazardFixup;
/// RegUnits of uses in the current soft memory clause.
@@ -114,6 +123,18 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
bool fixVALUTransCoexecutionHazards(MachineInstr *MI);
bool fixWMMAHazards(MachineInstr *MI);
int checkWMMACoexecutionHazards(MachineInstr *MI);
+ bool fixWMMACoexecutionHazards(MachineInstr *MI);
+ void ensureLoopInfoAvailable();
+ bool tryHoistWMMAVnopsFromLoop(MachineInstr *MI, int WaitStatesNeeded);
+ bool hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
+ bool IncludeSubloops = true);
+ bool hasWMMAToWMMARegOverlap(const MachineInstr &WMMA,
+ const MachineInstr &MI) const;
+ bool hasWMMAToVALURegOverlap(const MachineInstr &WMMA,
+ const MachineInstr &MI) const;
+ bool isCoexecutionHazardFor(const MachineInstr &I,
+ const MachineInstr &MI) const;
+ void insertVnopsBeforeTerminator(MachineBasicBlock *MBB, int Count);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
bool fixRequiredExportPriority(MachineInstr *MI);
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
index 283b1a352d1d6..3feabeb8986ce 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
@@ -1,38 +1,212 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass post-RA-hazard-rec -amdgpu-wmma-vnop-hoisting=false %s -o - | FileCheck -check-prefix=NOHOIST %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=HOIST %s
-# Test: WMMA outside both loops, VALU in inner loop
-# Currently NOPs are inserted inside the loop body (bb.2).
-# A future optimization could hoist these NOPs to the preheader (bb.0).
+# Test 1: WMMA outside loop, VALU inside loop
+# The NOPs should be hoisted from the loop body to the preheader
+---
+name: test_simple_loop_hoist
+body: |
+ ; NOHOIST-LABEL: name: test_simple_loop_hoist
+ ; NOHOIST: bb.0:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.1:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ;
+ ; HOIST-LABEL: name: test_simple_loop_hoist
+ ; HOIST: bb.0:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.1:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ bb.0:
+ successors: %bb.1
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+ bb.1:
+ successors: %bb.1
+ $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ S_BRANCH %bb.1
+...
+
+# Test 2: WMMA hazard INSIDE the loop; should NOT hoist
+---
+name: test_internal_hazard_no_hoist
+body: |
+ ; NOHOIST-LABEL: name: test_internal_hazard_no_hoist
+ ; NOHOIST: bb.0:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.1:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ;
+ ; HOIST-LABEL: name: test_internal_hazard_no_hoist
+ ; HOIST: bb.0:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.1:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ bb.0:
+ successors: %bb.1
+ S_BRANCH %bb.1
+ bb.1:
+ successors: %bb.1
+ ; WMMA inside the loop writes to vgpr16-23, VALU reads vgpr16
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ S_BRANCH %bb.1
+...
+
+# Test 3: WMMA in loop but no hazard
+---
+name: test_wmma_in_loop_no_conflict_hoist
+body: |
+ ; NOHOIST-LABEL: name: test_wmma_in_loop_no_conflict_hoist
+ ; NOHOIST: bb.0:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.1:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, 8, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ;
+ ; HOIST-LABEL: name: test_wmma_in_loop_no_conflict_hoist
+ ; HOIST: bb.0:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.1:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, 8, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, 0, implicit $exec
+ ; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ bb.0:
+ successors: %bb.1
+ ; External WMMA writes to vgpr16-23
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+ bb.1:
+ successors: %bb.1
+ ; Loop WMMA writes to vgpr56-63 (different registers)
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, 8, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, 0, implicit $exec
+ ; This reads vgpr16 from the external WMMA
+ $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ S_BRANCH %bb.1
+...
+
+# Test 4: WMMA outside both loops, VALU in inner loop
+# NOPs should be hoisted to the outermost preheader (bb.0)
---
name: test_nested_loop_hoist_to_outermost
body: |
- ; CHECK-LABEL: name: test_nested_loop_hoist_to_outermost
- ; CHECK: bb.0:
- ; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: S_BRANCH %bb.1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit undef $scc
- ; CHECK-NEXT: S_BRANCH %bb.2
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: V_NOP_e32 implicit $exec
- ; CHECK-NEXT: V_NOP_e32 implicit $exec
- ; CHECK-NEXT: V_NOP_e32 implicit $exec
- ; CHECK-NEXT: V_NOP_e32 implicit $exec
- ; CHECK-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
- ; CHECK-NEXT: S_BRANCH %bb.2
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: S_ENDPGM 0
+ ; NOHOIST-LABEL: name: test_nested_loop_hoist_to_outermost
+ ; NOHOIST: bb.0:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.1:
+ ; NOHOIST-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+ ; NOHOIST-NEXT: S_BRANCH %bb.2
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.2:
+ ; NOHOIST-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; NOHOIST-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.2
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.3:
+ ; NOHOIST-NEXT: S_ENDPGM 0
+ ;
+ ; HOIST-LABEL: name: test_nested_loop_hoist_to_outermost
+ ; HOIST: bb.0:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.1:
+ ; HOIST-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+ ; HOIST-NEXT: S_BRANCH %bb.2
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.2:
+ ; HOIST-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; HOIST-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.2
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.3:
+ ; HOIST-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1
; WMMA outside all loops - writes to vgpr16-23
@@ -54,3 +228,225 @@ body: |
; Exit block
S_ENDPGM 0
...
+
+# Test 5: Triple nested loop - WMMA hazard in outer loop (L1)
+# VALU in innermost loop (L3) reads from WMMA in L1's body
+# NOPs should be hoisted to L2's preheader (bb.1)
+---
+name: test_triple_nested_hoist_to_intermediate
+body: |
+ ; NOHOIST-LABEL: name: test_triple_nested_hoist_to_intermediate
+ ; NOHOIST: bb.0:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.1:
+ ; NOHOIST-NEXT: successors: %bb.2(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.2
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.2:
+ ; NOHOIST-NEXT: successors: %bb.3(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: S_BRANCH %bb.3
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.3:
+ ; NOHOIST-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; NOHOIST-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.3
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.4:
+ ; NOHOIST-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: S_CBRANCH_SCC1 %bb.5, implicit undef $scc
+ ; NOHOIST-NEXT: S_BRANCH %bb.2
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.5:
+ ; NOHOIST-NEXT: successors: %bb.1(0x40000000), %bb.6(0x40000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: S_CBRANCH_SCC1 %bb.6, implicit undef $scc
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.6:
+ ; NOHOIST-NEXT: S_ENDPGM 0
+ ;
+ ; HOIST-LABEL: name: test_triple_nested_hoist_to_intermediate
+ ; HOIST: bb.0:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.1:
+ ; HOIST-NEXT: successors: %bb.2(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.2
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.2:
+ ; HOIST-NEXT: successors: %bb.3(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: S_BRANCH %bb.3
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.3:
+ ; HOIST-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; HOIST-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.3
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.4:
+ ; HOIST-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: S_CBRANCH_SCC1 %bb.5, implicit undef $scc
+ ; HOIST-NEXT: S_BRANCH %bb.2
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.5:
+ ; HOIST-NEXT: successors: %bb.1(0x40000000), %bb.6(0x40000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: S_CBRANCH_SCC1 %bb.6, implicit undef $scc
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.6:
+ ; HOIST-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+ S_BRANCH %bb.1
+ bb.1:
+ successors: %bb.2
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.2
+ bb.2:
+ successors: %bb.3
+ S_BRANCH %bb.3
+ bb.3:
+ ; VALU reads vgpr16 from WMMA in bb.1
+ successors: %bb.3, %bb.4
+ $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ S_CBRANCH_EXECZ %bb.4, implicit $exec
+ S_BRANCH %bb.3
+ bb.4:
+ ; L2 latch - back to L2 header or exit to L1 latch
+ successors: %bb.2, %bb.5
+ S_CBRANCH_SCC1 %bb.5, implicit undef $scc
+ S_BRANCH %bb.2
+ bb.5:
+ ; L1 latch - back to L1 header or exit
+ successors: %bb.1, %bb.6
+ S_CBRANCH_SCC1 %bb.6, implicit undef $scc
+ S_BRANCH %bb.1
+ bb.6:
+ ; Exit
+ S_ENDPGM 0
+...
+
+# Test 6: No preheader (multiple predecessors) - cannot hoist
+---
+name: test_no_preheader_no_hoist
+body: |
+ ; NOHOIST-LABEL: name: test_no_preheader_no_hoist
+ ; NOHOIST: bb.0:
+ ; NOHOIST-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: S_CBRANCH_SCC1 %bb.2, implicit undef $scc
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.1:
+ ; NOHOIST-NEXT: successors: %bb.2(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: S_BRANCH %bb.2
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.2:
+ ; NOHOIST-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; NOHOIST-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.2
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.3:
+ ; NOHOIST-NEXT: S_ENDPGM 0
+ ;
+ ; HOIST-LABEL: name: test_no_preheader_no_hoist
+ ; HOIST: bb.0:
+ ; HOIST-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; HOIST-NEXT: S_CBRANCH_SCC1 %bb.2, implicit undef $scc
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.1:
+ ; HOIST-NEXT: successors: %bb.2(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: S_BRANCH %bb.2
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.2:
+ ; HOIST-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; HOIST-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.2
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.3:
+ ; HOIST-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1, %bb.2
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ S_CBRANCH_SCC1 %bb.2, implicit undef $scc
+ S_BRANCH %bb.1
+ bb.1:
+ successors: %bb.2
+ S_BRANCH %bb.2
+ bb.2:
+ ; Loop header with two predecessors (bb.0 and bb.1)
+ successors: %bb.2, %bb.3
+ $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ S_CBRANCH_EXECZ %bb.3, implicit $exec
+ S_BRANCH %bb.2
+ bb.3:
+ S_ENDPGM 0
+...
+
+# Test 7: Not in a loop; should NOT hoist
+---
+name: test_not_in_loop_no_hoist
+body: |
+ bb.0:
+ ; NOHOIST-LABEL: name: test_not_in_loop_no_hoist
+ ; NOHOIST: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ;
+ ; HOIST-LABEL: name: test_not_in_loop_no_hoist
+ ; HOIST: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+...
+
>From 95f7b772cf3a81f79876a4c1913dd5900466b511 Mon Sep 17 00:00:00 2001
From: Prasoon Mishra <Prasoon.Mishra at amd.com>
Date: Tue, 20 Jan 2026 16:04:05 +0000
Subject: [PATCH 2/7] Fix: - Drop DebugLoc from hoisted V_NOPs - Use
printMBBReference
---
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 5ba511d3d5e0f..1680286e66612 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2163,11 +2163,8 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
void GCNHazardRecognizer::insertVnopsBeforeTerminator(MachineBasicBlock *MBB,
int Count) {
MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
- const DebugLoc &DL =
- InsertPt != MBB->end() ? InsertPt->getDebugLoc() : DebugLoc();
-
for (int i = 0; i < Count; ++i) {
- BuildMI(*MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
+ BuildMI(*MBB, InsertPt, DebugLoc(), TII.get(AMDGPU::V_NOP_e32));
}
}
@@ -2297,7 +2294,8 @@ bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
}
LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
- << " V_NOPs from loop to " << Preheader->getName() << "\n");
+ << " V_NOPs from loop to " << printMBBReference(*Preheader)
+ << "\n");
insertVnopsBeforeTerminator(Preheader, WaitStatesNeeded);
NumWMMANopsHoisted += WaitStatesNeeded;
>From e1c6bcd30b0306eeec0f612f20dd212c752ef463 Mon Sep 17 00:00:00 2001
From: Prasoon Mishra <Prasoon.Mishra at amd.com>
Date: Wed, 21 Jan 2026 03:56:03 +0000
Subject: [PATCH 3/7] Use existing emitVNops and added a test for preheader
without terminator.
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 30 ++++--------
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 8 ++--
.../test/CodeGen/AMDGPU/wmma-nop-hoisting.mir | 46 ++++++++++++++++++-
3 files changed, 58 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 1680286e66612..bf60330aa0ad8 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1273,18 +1273,12 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}
-// emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we need
-// to insert, negative means not needed.
-bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) {
- if (WaitStatesNeeded <= 0)
- return false;
-
- const SIInstrInfo *TII = ST.getInstrInfo();
+void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt,
+ int WaitStatesNeeded, bool IsHoisting) {
+ const DebugLoc &DL = IsHoisting ? DebugLoc() : InsertPt->getDebugLoc();
for (int I = 0; I < WaitStatesNeeded; ++I)
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_NOP_e32));
-
- return true;
+ BuildMI(MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
}
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
@@ -2160,14 +2154,6 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
return WaitStatesNeeded;
}
-void GCNHazardRecognizer::insertVnopsBeforeTerminator(MachineBasicBlock *MBB,
- int Count) {
- MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
- for (int i = 0; i < Count; ++i) {
- BuildMI(*MBB, InsertPt, DebugLoc(), TII.get(AMDGPU::V_NOP_e32));
- }
-}
-
bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
const MachineInstr &WMMA, const MachineInstr &MI) const {
Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
@@ -2297,7 +2283,8 @@ bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
<< " V_NOPs from loop to " << printMBBReference(*Preheader)
<< "\n");
- insertVnopsBeforeTerminator(Preheader, WaitStatesNeeded);
+ emitVNops(*Preheader, Preheader->getFirstTerminator(), WaitStatesNeeded,
+ /*IsHoisting=*/true);
NumWMMANopsHoisted += WaitStatesNeeded;
return true;
}
@@ -2310,7 +2297,8 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
return true;
- return emitVNops(MI, WaitStatesNeeded);
+ emitVNops(*MI->getParent(), MI->getIterator(), WaitStatesNeeded);
+ return true;
}
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 0a66b4206ce4a..87a9aba69ec32 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -107,9 +107,10 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
int checkReadM0Hazards(MachineInstr *SMovRel);
int checkNSAtoVMEMHazard(MachineInstr *MI);
int checkFPAtomicToDenormModeHazard(MachineInstr *MI);
- // Emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we
- // need to insert, negative means not needed.
- bool emitVNops(MachineInstr *MI, int WaitStatesNeeded);
+ // Emit \p WaitStatesNeeded V_NOP instructions before \p InsertPt.
+ // If IsHoisting is true, uses empty DebugLoc for compiler-inserted NOPs.
+ void emitVNops(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ int WaitStatesNeeded, bool IsHoisting = false);
void fixHazards(MachineInstr *MI);
bool fixVcmpxPermlaneHazards(MachineInstr *MI);
bool fixVMEMtoScalarWriteHazards(MachineInstr *MI);
@@ -134,7 +135,6 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
const MachineInstr &MI) const;
bool isCoexecutionHazardFor(const MachineInstr &I,
const MachineInstr &MI) const;
- void insertVnopsBeforeTerminator(MachineBasicBlock *MBB, int Count);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
bool fixRequiredExportPriority(MachineInstr *MI);
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
index 3feabeb8986ce..e0ebbf0b53d9f 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
@@ -426,7 +426,51 @@ body: |
S_ENDPGM 0
...
-# Test 7: Not in a loop; should NOT hoist
+# Test 7: Preheader without terminator
+---
+name: test_fallthrough_preheader_hoist
+body: |
+ ; NOHOIST-LABEL: name: test_fallthrough_preheader_hoist
+ ; NOHOIST: bb.0:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.1:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ;
+ ; HOIST-LABEL: name: test_fallthrough_preheader_hoist
+ ; HOIST: bb.0:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.1:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ bb.0:
+ successors: %bb.1
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ bb.1:
+ successors: %bb.1
+ $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ S_BRANCH %bb.1
+...
+
+# Test 8: Not in a loop; should NOT hoist
---
name: test_not_in_loop_no_hoist
body: |
>From 58c4ab7d0eb97756c0d49fb42fc4253de19e067d Mon Sep 17 00:00:00 2001
From: Prasoon Mishra <Prasoon.Mishra at amd.com>
Date: Wed, 21 Jan 2026 09:58:38 +0000
Subject: [PATCH 4/7] Take MLI from Pass manager and small nitfix
---
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 4 ++-
llvm/lib/CodeGen/PostRAHazardRecognizer.cpp | 21 ++++++++++------
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 25 ++++++-------------
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 8 +++---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 5 ++--
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3 ++-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 9 +++++++
7 files changed, 42 insertions(+), 33 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 45713360d44de..ed968d7bd4593 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -47,6 +47,7 @@ class InstrItineraryData;
class LiveIntervals;
class LiveVariables;
class MachineLoop;
+class MachineLoopInfo;
class MachineMemOperand;
class MachineModuleInfo;
class MachineRegisterInfo;
@@ -1785,7 +1786,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
/// Allocate and return a hazard recognizer to use for by non-scheduling
/// passes.
virtual ScheduleHazardRecognizer *
- CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
+ CreateTargetPostRAHazardRecognizer(const MachineFunction &MF,
+ MachineLoopInfo *MLI) const {
return nullptr;
}
diff --git a/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp b/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp
index 29cfc06d90b29..906eea29f1a15 100644
--- a/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp
+++ b/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp
@@ -29,6 +29,7 @@
#include "llvm/CodeGen/PostRAHazardRecognizer.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -42,7 +43,7 @@ STATISTIC(NumNoops, "Number of noops inserted");
namespace {
struct PostRAHazardRecognizer {
- bool run(MachineFunction &MF);
+ bool run(MachineFunction &MF, MachineLoopInfo *MLI);
};
class PostRAHazardRecognizerLegacy : public MachineFunctionPass {
@@ -53,11 +54,13 @@ class PostRAHazardRecognizerLegacy : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
bool runOnMachineFunction(MachineFunction &Fn) override {
- return PostRAHazardRecognizer().run(Fn);
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+ return PostRAHazardRecognizer().run(Fn, &MLI);
}
};
char PostRAHazardRecognizerLegacy::ID = 0;
@@ -66,13 +69,17 @@ char PostRAHazardRecognizerLegacy::ID = 0;
char &llvm::PostRAHazardRecognizerID = PostRAHazardRecognizerLegacy::ID;
-INITIALIZE_PASS(PostRAHazardRecognizerLegacy, DEBUG_TYPE,
- "Post RA hazard recognizer", false, false)
+INITIALIZE_PASS_BEGIN(PostRAHazardRecognizerLegacy, DEBUG_TYPE,
+ "Post RA hazard recognizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
+INITIALIZE_PASS_END(PostRAHazardRecognizerLegacy, DEBUG_TYPE,
+ "Post RA hazard recognizer", false, false)
PreservedAnalyses
llvm::PostRAHazardRecognizerPass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
- if (!PostRAHazardRecognizer().run(MF))
+ MachineLoopInfo *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
+ if (!PostRAHazardRecognizer().run(MF, MLI))
return PreservedAnalyses::all();
auto PA = getMachineFunctionPassPreservedAnalyses();
@@ -80,10 +87,10 @@ llvm::PostRAHazardRecognizerPass::run(MachineFunction &MF,
return PA;
}
-bool PostRAHazardRecognizer::run(MachineFunction &Fn) {
+bool PostRAHazardRecognizer::run(MachineFunction &Fn, MachineLoopInfo *MLI) {
const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
std::unique_ptr<ScheduleHazardRecognizer> HazardRec(
- TII->CreateTargetPostRAHazardRecognizer(Fn));
+ TII->CreateTargetPostRAHazardRecognizer(Fn, MLI));
// Return if the target has not implemented a hazard recognizer.
if (!HazardRec)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index bf60330aa0ad8..57d60e0c21dee 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -70,10 +70,11 @@ static cl::opt<bool> EnableWMMAVnopHoisting(
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
const GCNSubtarget &ST);
-GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
+GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF,
+ MachineLoopInfo *MLI)
: IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
- TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
+ TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
@@ -2206,13 +2207,15 @@ bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
const MachineInstr &MI) const {
+ // I is the potential WMMA hazard source, MI is the instruction being checked
+ // for hazard.
if (!TII.isXDLWMMA(I))
return false;
// Dispatch based on MI type
if (TII.isXDLWMMA(MI))
return hasWMMAToWMMARegOverlap(I, MI);
- else if (isCoexecutableVALUInst(MI))
+ if (isCoexecutableVALUInst(MI))
return hasWMMAToVALURegOverlap(I, MI);
return false;
@@ -2235,22 +2238,10 @@ bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
return false;
}
-void GCNHazardRecognizer::ensureLoopInfoAvailable() {
- // Lazily compute MDT and MLI only when needed
- if (MLI)
- return;
-
- OwnedMDT =
- std::make_unique<MachineDominatorTree>(const_cast<MachineFunction &>(MF));
- OwnedMLI = std::make_unique<MachineLoopInfo>();
- OwnedMLI->analyze(*OwnedMDT);
-
- MLI = OwnedMLI.get();
-}
-
bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
int WaitStatesNeeded) {
- ensureLoopInfoAvailable();
+ if (!MLI)
+ return false;
MachineLoop *L = MLI->getLoopFor(MI->getParent());
if (!L) {
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 87a9aba69ec32..3f8c43673a3ba 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -53,9 +53,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
const SIRegisterInfo &TRI;
const TargetSchedModel &TSchedModel;
- // Loop info for V_NOP hoisting, computed on demand only when needed.
- std::unique_ptr<MachineDominatorTree> OwnedMDT;
- std::unique_ptr<MachineLoopInfo> OwnedMLI;
+ // Loop info for V_NOP hoisting, passed from the pass manager.
MachineLoopInfo *MLI = nullptr;
bool RunLdsBranchVmemWARHazardFixup;
@@ -125,7 +123,6 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
bool fixWMMAHazards(MachineInstr *MI);
int checkWMMACoexecutionHazards(MachineInstr *MI);
bool fixWMMACoexecutionHazards(MachineInstr *MI);
- void ensureLoopInfoAvailable();
bool tryHoistWMMAVnopsFromLoop(MachineInstr *MI, int WaitStatesNeeded);
bool hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
bool IncludeSubloops = true);
@@ -167,7 +164,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
int checkPermlaneHazards(MachineInstr *MI);
public:
- GCNHazardRecognizer(const MachineFunction &MF);
+ GCNHazardRecognizer(const MachineFunction &MF,
+ MachineLoopInfo *MLI = nullptr);
// We can only issue one instruction per cycle.
bool atIssueLimit() const override { return true; }
void EmitInstruction(SUnit *SU) override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9180d5fc8bcf0..426f97bfff676 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9911,8 +9911,9 @@ SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
/// pass.
ScheduleHazardRecognizer *
-SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
- return new GCNHazardRecognizer(MF);
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF,
+ MachineLoopInfo *MLI) const {
+ return new GCNHazardRecognizer(MF, MLI);
}
// Called during:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0b54513bb6114..d7c149ad0c355 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1582,7 +1582,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
const ScheduleDAG *DAG) const override;
ScheduleHazardRecognizer *
- CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
+ CreateTargetPostRAHazardRecognizer(const MachineFunction &MF,
+ MachineLoopInfo *MLI) const override;
ScheduleHazardRecognizer *
CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 2904ba604fb1b..0e7caad5786d6 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -146,6 +146,7 @@
; GCN-O0-NEXT: SI insert wait instructions
; GCN-O0-NEXT: Insert required mode register values
; GCN-O0-NEXT: SI Final Branch Preparation
+; GCN-O0-NEXT: Machine Natural Loop Construction
; GCN-O0-NEXT: Post RA hazard recognizer
; GCN-O0-NEXT: AMDGPU Insert waits for SGPR read hazards
; GCN-O0-NEXT: AMDGPU Lower VGPR Encoding
@@ -438,6 +439,8 @@
; GCN-O1-NEXT: SI Insert Hard Clauses
; GCN-O1-NEXT: SI Final Branch Preparation
; GCN-O1-NEXT: SI peephole optimizations
+; GCN-O1-NEXT: MachineDominator Tree Construction
+; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: Post RA hazard recognizer
; GCN-O1-NEXT: AMDGPU Insert waits for SGPR read hazards
; GCN-O1-NEXT: AMDGPU Lower VGPR Encoding
@@ -758,6 +761,8 @@
; GCN-O1-OPTS-NEXT: SI Insert Hard Clauses
; GCN-O1-OPTS-NEXT: SI Final Branch Preparation
; GCN-O1-OPTS-NEXT: SI peephole optimizations
+; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
+; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Post RA hazard recognizer
; GCN-O1-OPTS-NEXT: AMDGPU Insert waits for SGPR read hazards
; GCN-O1-OPTS-NEXT: AMDGPU Lower VGPR Encoding
@@ -1083,6 +1088,8 @@
; GCN-O2-NEXT: SI Insert Hard Clauses
; GCN-O2-NEXT: SI Final Branch Preparation
; GCN-O2-NEXT: SI peephole optimizations
+; GCN-O2-NEXT: MachineDominator Tree Construction
+; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Post RA hazard recognizer
; GCN-O2-NEXT: AMDGPU Insert waits for SGPR read hazards
; GCN-O2-NEXT: AMDGPU Lower VGPR Encoding
@@ -1421,6 +1428,8 @@
; GCN-O3-NEXT: SI Insert Hard Clauses
; GCN-O3-NEXT: SI Final Branch Preparation
; GCN-O3-NEXT: SI peephole optimizations
+; GCN-O3-NEXT: MachineDominator Tree Construction
+; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Post RA hazard recognizer
; GCN-O3-NEXT: AMDGPU Insert waits for SGPR read hazards
; GCN-O3-NEXT: AMDGPU Lower VGPR Encoding
>From 87d6471ef6ae6e75a1d20588af4acc97e181d09a Mon Sep 17 00:00:00 2001
From: Prasoon Mishra <Prasoon.Mishra at amd.com>
Date: Wed, 21 Jan 2026 11:19:59 +0000
Subject: [PATCH 5/7] Remove memory & MachineDominators header as there is no
need now.
---
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 3f8c43673a3ba..b331504d40113 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -15,12 +15,10 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include <list>
-#include <memory>
namespace llvm {
>From 8255ab4787df91be737c66377ec1072df38815bc Mon Sep 17 00:00:00 2001
From: Dark Steve <Prasoon.Mishra at amd.com>
Date: Wed, 18 Feb 2026 06:38:43 +0000
Subject: [PATCH 6/7] Reverted the changes of llc-pipeline as now late
branching and peephole preserve MLI.
---
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 9 ---------
1 file changed, 9 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 0e7caad5786d6..2904ba604fb1b 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -146,7 +146,6 @@
; GCN-O0-NEXT: SI insert wait instructions
; GCN-O0-NEXT: Insert required mode register values
; GCN-O0-NEXT: SI Final Branch Preparation
-; GCN-O0-NEXT: Machine Natural Loop Construction
; GCN-O0-NEXT: Post RA hazard recognizer
; GCN-O0-NEXT: AMDGPU Insert waits for SGPR read hazards
; GCN-O0-NEXT: AMDGPU Lower VGPR Encoding
@@ -439,8 +438,6 @@
; GCN-O1-NEXT: SI Insert Hard Clauses
; GCN-O1-NEXT: SI Final Branch Preparation
; GCN-O1-NEXT: SI peephole optimizations
-; GCN-O1-NEXT: MachineDominator Tree Construction
-; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: Post RA hazard recognizer
; GCN-O1-NEXT: AMDGPU Insert waits for SGPR read hazards
; GCN-O1-NEXT: AMDGPU Lower VGPR Encoding
@@ -761,8 +758,6 @@
; GCN-O1-OPTS-NEXT: SI Insert Hard Clauses
; GCN-O1-OPTS-NEXT: SI Final Branch Preparation
; GCN-O1-OPTS-NEXT: SI peephole optimizations
-; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
-; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Post RA hazard recognizer
; GCN-O1-OPTS-NEXT: AMDGPU Insert waits for SGPR read hazards
; GCN-O1-OPTS-NEXT: AMDGPU Lower VGPR Encoding
@@ -1088,8 +1083,6 @@
; GCN-O2-NEXT: SI Insert Hard Clauses
; GCN-O2-NEXT: SI Final Branch Preparation
; GCN-O2-NEXT: SI peephole optimizations
-; GCN-O2-NEXT: MachineDominator Tree Construction
-; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Post RA hazard recognizer
; GCN-O2-NEXT: AMDGPU Insert waits for SGPR read hazards
; GCN-O2-NEXT: AMDGPU Lower VGPR Encoding
@@ -1428,8 +1421,6 @@
; GCN-O3-NEXT: SI Insert Hard Clauses
; GCN-O3-NEXT: SI Final Branch Preparation
; GCN-O3-NEXT: SI peephole optimizations
-; GCN-O3-NEXT: MachineDominator Tree Construction
-; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Post RA hazard recognizer
; GCN-O3-NEXT: AMDGPU Insert waits for SGPR read hazards
; GCN-O3-NEXT: AMDGPU Lower VGPR Encoding
>From 3ca4039ebb29378a12798041e4225a08fb32f282 Mon Sep 17 00:00:00 2001
From: Dark Steve <Prasoon.Mishra at amd.com>
Date: Wed, 25 Feb 2026 04:06:25 +0000
Subject: [PATCH 7/7] Add a test where the hazard occurs via a back-edge
---
.../test/CodeGen/AMDGPU/wmma-nop-hoisting.mir | 46 +++++++++++++++++++
1 file changed, 46 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
index e0ebbf0b53d9f..9918c9077d1b7 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
@@ -494,3 +494,49 @@ body: |
$vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
...
+# Test 9: VALU first, then WMMA in the same loop (hazard via back-edge); the V_NOPs must stay in the loop
+---
+name: test_valu_before_wmma_backedge_no_hoist
+body: |
+ ; NOHOIST-LABEL: name: test_valu_before_wmma_backedge_no_hoist
+ ; NOHOIST: bb.0:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.1:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ;
+ ; HOIST-LABEL: name: test_valu_before_wmma_backedge_no_hoist
+ ; HOIST: bb.0:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.1:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ bb.0:
+ successors: %bb.1
+ S_BRANCH %bb.1
+ bb.1:
+ successors: %bb.1
+ $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+...
More information about the llvm-commits
mailing list