[llvm] [AMDGPU] IGLP: Fixes for VMEM load detection and unsigned int handling (PR #135090)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 9 15:11:17 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Robert Imschweiler (ro-i)
Fixes:
- detection of VMEM_READ instructions that are FLAT loads.
- unsigned integer underflows in `MFMASmallGemmSingleWaveOpt::applyIGLPStrategy` (see the sketch below).
- the global static DSW counters, which were not reset between runs (see the sketch after the diff).

This LLVM defect was identified via the AMD Fuzzing project.
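A minimal standalone sketch of the underflow item, with made-up counts (nothing here is the compiler's actual code): when the offset exceeds the count, the unsigned subtraction in the old loop bound wraps around, while the fixed form starts the induction variable at the offset and so executes zero iterations:

```cpp
#include <cstdio>

int main() {
  // Hypothetical counts where the offset exceeds the count, mirroring
  // the situation the fuzzer exposed.
  unsigned DSWCount = 2, DSWWithPermCount = 5;

  // Old form: the unsigned subtraction wraps around (4294967293 for a
  // 32-bit unsigned), so the loop would run ~4 billion iterations.
  //   for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) ...
  printf("wrapped bound: %u\n", DSWCount - DSWWithPermCount);

  // Fixed form: start at the offset; the body never executes when the
  // offset is greater than or equal to the count.
  for (unsigned I = DSWWithPermCount; I < DSWCount; I++)
    printf("iteration %u\n", I); // not reached with these counts
  return 0;
}
```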
---
Full diff: https://github.com/llvm/llvm-project/pull/135090.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp (+18-11)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll (+22)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 7b4d00c8214cb..cea3bcf4b31df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -80,6 +80,10 @@ enum class SchedGroupMask {
   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
 };
 
+static bool handleAsVMEMInstr(const MachineInstr &MI, const SIInstrInfo *TII) {
+  return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
+}
+
 class SchedGroup;
 
 // InstructionRule class is used to enact a filter which determines whether or
@@ -1891,7 +1895,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
         }
       }
 
-      assert(Cache->size());
+      assert(!MFMAsFound || Cache->size());
       auto *DAG = SyncPipe[0].DAG;
       for (auto &Elt : *Cache) {
         if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
@@ -1994,7 +1998,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
       }
 
       if (NumBits < 128) {
-        assert(TII->isVMEM(*MI) && MI->mayLoad());
+        assert(handleAsVMEMInstr(*MI, TII) && MI->mayLoad());
         if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
                           MRI, MI->getOperand(0))) <=
             128)
@@ -2079,6 +2083,9 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
 static unsigned DSWCount = 0;
 static unsigned DSWWithPermCount = 0;
 static unsigned DSWWithSharedVMEMCount = 0;
+static void resetDSWCounters() {
+  DSWCount = DSWWithPermCount = DSWWithSharedVMEMCount = 0;
+}
 
 bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
@@ -2138,7 +2145,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
         for (auto &Succ : Pred.getSUnit()->Succs) {
           auto *MI = Succ.getSUnit()->getInstr();
-          if (!TII->isVMEM(*MI) || !MI->mayLoad())
+          if (!handleAsVMEMInstr(*MI, TII) || !MI->mayLoad())
             continue;
 
           if (MissedAny || !VMEMLookup.size()) {
@@ -2200,7 +2207,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
   SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
   // Interleave MFMA with DS_READ prefetch
-  for (unsigned I = 0; I < DSRCount - 4; ++I) {
+  for (unsigned I = 4; I < DSRCount; ++I) {
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
         SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
@@ -2213,7 +2220,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
   // Phase 2a: Loop carried dependency with V_PERM
   // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
   // depend on. Interleave MFMA to keep XDL unit busy throughout.
-  for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
+  for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) {
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
         SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
     SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
@@ -2250,7 +2257,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
   // Phase 2b: Loop carried dependency without V_PERM
   // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
   // Interleave MFMA to keep XDL unit busy throughout.
-  for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
+  for (unsigned I = DSWWithPermCount; I < DSWCount; I++) {
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
@@ -2426,17 +2433,15 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
-           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+           handleAsVMEMInstr(MI, TII))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
-           MI.mayLoad() &&
-           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+           MI.mayLoad() && handleAsVMEMInstr(MI, TII))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
-           MI.mayStore() &&
-           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+           MI.mayStore() && handleAsVMEMInstr(MI, TII))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
@@ -2703,5 +2708,7 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
 /// for a given region.
 std::unique_ptr<ScheduleDAGMutation>
 llvm::createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
+  if (Phase == AMDGPU::SchedulingPhase::Initial)
+    resetDSWCounters();
   return std::make_unique<IGroupLPDAGMutation>(Phase);
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 08c0d15432915..3ce25c0fd1fef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -321,6 +321,28 @@ entry:
   ret void
 }
 
+; Check fixes for:
+; - detection of VMEM_READS which are FLAT loads.
+; - unsigned int underflows in MFMASmallGemmSingleWaveOpt::applyIGLPStrategy.
+; - resetting global static DSWCounters for new runs.
+; (reduced fuzzer-generated test case)
+define amdgpu_kernel void @test_iglp_opt_flat_load(ptr %ptr1, ptr %ptr2, ptr addrspace(3) %ptr3, ptr addrspace(3) %ptr4) {
+entry:
+  %LGV2 = load <8 x half>, ptr %ptr1, align 16
+  %LGV = load i1, ptr %ptr2, align 1
+  call void @llvm.amdgcn.iglp.opt(i32 1)
+  %C = fcmp ugt <8 x half> zeroinitializer, %LGV2
+  store <8 x i1> %C, ptr addrspace(3) %ptr3, align 1
+  br i1 %LGV, label %common.ret, label %F
+
+common.ret:                                       ; preds = %F, %entry
+  ret void
+
+F:                                                ; preds = %entry
+  store <32 x float> zeroinitializer, ptr addrspace(3) %ptr4, align 128
+  br label %common.ret
+}
+
 declare void @llvm.amdgcn.iglp.opt(i32) #1
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1
``````````
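On the third fix: the DSW counts live in file-scope statics, so they survive from one scheduling run to the next; the patch clears them whenever a new initial scheduling phase begins. A minimal standalone sketch of that persistence-and-reset pattern (hypothetical names, not the compiler's code):

```cpp
#include <cassert>

// File-scope statics keep their values across calls, just as the DSW
// counters in AMDGPUIGroupLP.cpp keep theirs across scheduler runs.
static unsigned Count = 0;

static void resetCounters() { Count = 0; }

// Analogous to createIGroupLPDAGMutation: clear stale state when a new
// initial scheduling phase begins.
static void beginRun(bool IsInitialPhase) {
  if (IsInitialPhase)
    resetCounters();
}

static void processRegion(unsigned N) { Count += N; }

int main() {
  beginRun(/*IsInitialPhase=*/true);
  processRegion(3);
  assert(Count == 3);
  // Without the reset, the second run would start at 3 instead of 0.
  beginRun(/*IsInitialPhase=*/true);
  assert(Count == 0);
  return 0;
}
```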
https://github.com/llvm/llvm-project/pull/135090
More information about the llvm-commits mailing list