[llvm] [AMDGPU] Hoist WMMA coexecution hazard V_NOPs from loops to preheaders (PR #176895)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 20 03:17:46 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Dark Steve (PrasoonMishra)
<details>
<summary>Changes</summary>
On GFX1250, V_NOPs inserted for WMMA coexecution hazards are placed at the use site. When the hazard-consuming instruction is inside a loop and the WMMA is outside it, these NOPs execute on every iteration even though the hazard only needs to be covered once.
This patch hoists the V_NOPs to the loop preheader, reducing their execution count from once per loop iteration to once per loop entry.
```
Example (assuming a hazard requiring K V_NOPs):
Before:
bb.0 (preheader): WMMA writes vgpr0
bb.1 (loop): V_NOP xK, VALU reads vgpr0, branch bb.1
-> K NOPs executed per iteration
After:
bb.0 (preheader): WMMA writes vgpr0, V_NOP xK
bb.1 (loop): VALU reads vgpr0, branch bb.1
-> K NOPs executed once
```
For nested loops, V_NOPs are hoisted to the preheader of the outermost enclosing loop that contains no WMMA hazard for the consuming instruction.
Hoisting is restricted to strict preheaders (not any single predecessor) to avoid introducing V_NOPs on unrelated control flow paths.
MachineDominatorTree and MachineLoopInfo are computed lazily within GCNHazardRecognizer, the first time a hoisting candidate is encountered.
The optimization is controlled by `-amdgpu-wmma-vnop-hoisting` (default: on).
---
Patch is 36.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/176895.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+173-49)
- (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h (+21)
- (modified) llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir (+426-30)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 9838f1b1ef32a..d74d8c5dfa534 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -14,13 +14,22 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
+#define DEBUG_TYPE "gcn-hazard-recognizer"
+
+STATISTIC(NumWMMANopsHoisted,
+ "Number of WMMA hazard V_NOPs hoisted from loops");
+STATISTIC(NumWMMAHoistingBailed,
+ "Number of WMMA hazards where V_NOP hoisting was not possible");
+
namespace {
struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
@@ -49,6 +58,10 @@ static cl::opt<unsigned>
NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
cl::desc("Insert a s_nop x before every instruction"));
+static cl::opt<bool> EnableWMMAVnopHoisting(
+ "amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden,
+ cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
+
//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//
@@ -1287,7 +1300,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVALUTransUseHazard(MI);
fixVALUTransCoexecutionHazards(MI);
fixWMMAHazards(MI); // fall-through if co-execution is enabled.
- emitVNops(MI, checkWMMACoexecutionHazards(MI));
+ fixWMMACoexecutionHazards(MI);
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
fixRequiredExportPriority(MI);
@@ -2083,8 +2096,6 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
return 0;
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
-
// WaitStates here is the number of V_NOPs or unrelated VALU instructions must
// be in between the first WMMA and the second instruction to cover the hazard
// (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
@@ -2094,7 +2105,7 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
const int VALUWaitStates[] = {4, 8, 2, 4};
unsigned Category = 0;
- auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+ auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
if (!TII->isXDLWMMA(I))
return false;
@@ -2102,24 +2113,10 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
return false;
- Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
- Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
- Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
-
- // WMMA0 wrires (D0), WMMA1 reads (A1/B1/Idx1).
- if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
- return true;
-
- if (SIInstrInfo::isSWMMAC(*MI)) {
- Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
- if (TRI->regsOverlap(D0, Idx1))
- return true;
- }
-
- return false;
+ return hasWMMAToWMMARegOverlap(I, *MI);
};
- auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+ auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
if (!TII->isXDLWMMA(I))
return false;
@@ -2127,35 +2124,7 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
return false;
- // WMMA writes, VALU reads.
- Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
- for (const MachineOperand &ValuUse : MI->explicit_uses()) {
- if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
- return true;
- }
-
- auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
- if (!ValuDst || !ValuDst->isReg())
- return false;
- Register D1 = ValuDst->getReg();
-
- // WMMA writes, VALU writes.
- if (TRI->regsOverlap(D0, D1))
- return true;
-
- // WMMA reads, VALU writes.
- Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
- Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
- if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
- return true;
-
- if (SIInstrInfo::isSWMMAC(I)) {
- Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
- if (TRI->regsOverlap(D1, Idx0))
- return true;
- }
-
- return false;
+ return hasWMMAToVALURegOverlap(I, *MI);
};
int Limit = 0;
@@ -2190,6 +2159,161 @@ int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
return WaitStatesNeeded;
}
+void GCNHazardRecognizer::insertVnopsBeforeTerminator(MachineBasicBlock *MBB,
+ int Count) {
+ MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
+ const DebugLoc &DL =
+ InsertPt != MBB->end() ? InsertPt->getDebugLoc() : DebugLoc();
+
+ for (int i = 0; i < Count; ++i) {
+ BuildMI(*MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
+ }
+}
+
+bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
+ const MachineInstr &WMMA, const MachineInstr &MI) const {
+ Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
+ Register A1 = TII.getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
+ Register B1 = TII.getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
+
+ // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
+ if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
+ return true;
+
+ if (SIInstrInfo::isSWMMAC(MI)) {
+ Register Idx1 = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
+ if (TRI.regsOverlap(D0, Idx1))
+ return true;
+ }
+ return false;
+}
+
+bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
+ const MachineInstr &WMMA, const MachineInstr &MI) const {
+ // WMMA writes, VALU reads.
+ Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
+ for (const MachineOperand &ValuUse : MI.explicit_uses()) {
+ if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
+ return true;
+ }
+
+ auto *ValuDst = TII.getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (!ValuDst || !ValuDst->isReg())
+ return false;
+ Register D1 = ValuDst->getReg();
+
+ // WMMA writes, VALU writes.
+ if (TRI.regsOverlap(D0, D1))
+ return true;
+
+ // WMMA reads, VALU writes.
+ Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
+ Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
+ if (TRI.regsOverlap(A0, D1) || TRI.regsOverlap(B0, D1))
+ return true;
+
+ if (SIInstrInfo::isSWMMAC(WMMA)) {
+ Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
+ if (TRI.regsOverlap(D1, Idx0))
+ return true;
+ }
+ return false;
+}
+
+bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
+ const MachineInstr &MI) const {
+ if (!TII.isXDLWMMA(I))
+ return false;
+
+ // Dispatch based on MI type
+ if (TII.isXDLWMMA(MI))
+ return hasWMMAToWMMARegOverlap(I, MI);
+ else if (isCoexecutableVALUInst(MI))
+ return hasWMMAToVALURegOverlap(I, MI);
+
+ return false;
+}
+
+bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
+ bool IncludeSubloops) {
+ // Scan loop for any WMMA that hazards MI.
+ // TODO: Avoid full loop scan when WMMA is beyond VALU distance.
+ for (MachineBasicBlock *MBB : L->getBlocks()) {
+ if (!IncludeSubloops && MLI->getLoopFor(MBB) != L)
+ continue;
+ for (MachineInstr &I : *MBB) {
+ if (&I == MI)
+ continue;
+ if (isCoexecutionHazardFor(I, *MI))
+ return true;
+ }
+ }
+ return false;
+}
+
+void GCNHazardRecognizer::ensureLoopInfoAvailable() {
+ // Lazily compute MDT and MLI only when needed
+ if (MLI)
+ return;
+
+ OwnedMDT =
+ std::make_unique<MachineDominatorTree>(const_cast<MachineFunction &>(MF));
+ OwnedMLI = std::make_unique<MachineLoopInfo>();
+ OwnedMLI->analyze(*OwnedMDT);
+
+ MLI = OwnedMLI.get();
+}
+
+bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
+ int WaitStatesNeeded) {
+ ensureLoopInfoAvailable();
+
+ MachineLoop *L = MLI->getLoopFor(MI->getParent());
+ if (!L) {
+ ++NumWMMAHoistingBailed;
+ return false;
+ }
+
+ // If innermost loop has WMMA hazard, we can't hoist at all
+ if (hasWMMAHazardInLoop(L, MI)) {
+ ++NumWMMAHoistingBailed;
+ return false;
+ }
+
+ // Find outermost loop with no internal hazard
+ MachineLoop *TargetLoop = L;
+ while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
+ if (hasWMMAHazardInLoop(Parent, MI, false))
+ break; // Parent has hazard in its own blocks, stop here
+ TargetLoop = Parent; // Safe to hoist further out
+ }
+
+ // Need valid preheader to insert V_NOPs
+ MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
+ if (!Preheader) {
+ ++NumWMMAHoistingBailed;
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
+ << " V_NOPs from loop to " << Preheader->getName() << "\n");
+
+ insertVnopsBeforeTerminator(Preheader, WaitStatesNeeded);
+ NumWMMANopsHoisted += WaitStatesNeeded;
+ return true;
+}
+
+bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
+ int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
+ if (WaitStatesNeeded <= 0)
+ return false;
+
+ if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
+ return true;
+
+ return emitVNops(MI, WaitStatesNeeded);
+}
+
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
if (!ST.hasShift64HighRegBug())
return false;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index d725134639cfe..0a66b4206ce4a 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -15,9 +15,12 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include <list>
+#include <memory>
namespace llvm {
@@ -49,6 +52,12 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
const TargetSchedModel &TSchedModel;
+
+ // Loop info for V_NOP hoisting, computed on demand only when needed.
+ std::unique_ptr<MachineDominatorTree> OwnedMDT;
+ std::unique_ptr<MachineLoopInfo> OwnedMLI;
+ MachineLoopInfo *MLI = nullptr;
+
bool RunLdsBranchVmemWARHazardFixup;
/// RegUnits of uses in the current soft memory clause.
@@ -114,6 +123,18 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
bool fixVALUTransCoexecutionHazards(MachineInstr *MI);
bool fixWMMAHazards(MachineInstr *MI);
int checkWMMACoexecutionHazards(MachineInstr *MI);
+ bool fixWMMACoexecutionHazards(MachineInstr *MI);
+ void ensureLoopInfoAvailable();
+ bool tryHoistWMMAVnopsFromLoop(MachineInstr *MI, int WaitStatesNeeded);
+ bool hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
+ bool IncludeSubloops = true);
+ bool hasWMMAToWMMARegOverlap(const MachineInstr &WMMA,
+ const MachineInstr &MI) const;
+ bool hasWMMAToVALURegOverlap(const MachineInstr &WMMA,
+ const MachineInstr &MI) const;
+ bool isCoexecutionHazardFor(const MachineInstr &I,
+ const MachineInstr &MI) const;
+ void insertVnopsBeforeTerminator(MachineBasicBlock *MBB, int Count);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
bool fixRequiredExportPriority(MachineInstr *MI);
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
index 283b1a352d1d6..3feabeb8986ce 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-nop-hoisting.mir
@@ -1,38 +1,212 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass post-RA-hazard-rec -amdgpu-wmma-vnop-hoisting=false %s -o - | FileCheck -check-prefix=NOHOIST %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=HOIST %s
-# Test: WMMA outside both loops, VALU in inner loop
-# Currently NOPs are inserted inside the loop body (bb.2).
-# A future optimization could hoist these NOPs to the preheader (bb.0).
+# Test 1: WMMA outside loop, VALU inside loop
+# The NOPs should be hoisted from the loop body to the preheader
+---
+name: test_simple_loop_hoist
+body: |
+ ; NOHOIST-LABEL: name: test_simple_loop_hoist
+ ; NOHOIST: bb.0:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.1:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ;
+ ; HOIST-LABEL: name: test_simple_loop_hoist
+ ; HOIST: bb.0:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.1:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ bb.0:
+ successors: %bb.1
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+ bb.1:
+ successors: %bb.1
+ $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ S_BRANCH %bb.1
+...
+
+# Test 2: WMMA hazard INSIDE the loop; should NOT hoist
+---
+name: test_internal_hazard_no_hoist
+body: |
+ ; NOHOIST-LABEL: name: test_internal_hazard_no_hoist
+ ; NOHOIST: bb.0:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.1:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ;
+ ; HOIST-LABEL: name: test_internal_hazard_no_hoist
+ ; HOIST: bb.0:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: bb.1:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: V_NOP_e32 implicit $exec
+ ; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; HOIST-NEXT: S_BRANCH %bb.1
+ bb.0:
+ successors: %bb.1
+ S_BRANCH %bb.1
+ bb.1:
+ successors: %bb.1
+ ; WMMA inside the loop writes to vgpr16-23, VALU reads vgpr16
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ S_BRANCH %bb.1
+...
+
+# Test 3: WMMA in loop but no hazard
+---
+name: test_wmma_in_loop_no_conflict_hoist
+body: |
+ ; NOHOIST-LABEL: name: test_wmma_in_loop_no_conflict_hoist
+ ; NOHOIST: bb.0:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: bb.1:
+ ; NOHOIST-NEXT: successors: %bb.1(0x80000000)
+ ; NOHOIST-NEXT: {{ $}}
+ ; NOHOIST-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, 8, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, 0, implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: V_NOP_e32 implicit $exec
+ ; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ ; NOHOIST-NEXT: S_BRANCH %bb.1
+ ;
+ ; HOIST-LABEL: name: test_wmma_in_loop_no_conflict_hoist
+ ; HOIST: bb.0:
+ ; HOIST-NEXT: successors: %bb.1(0x80000000)
+ ; HOIST-NEXT: {{ $}}
+ ; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgp...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/176895
More information about the llvm-commits mailing list