[llvm] [AMDGPU] IGLP: Fixes for VMEM load detection and unsigned int handling (PR #135090)
Robert Imschweiler via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 10 04:59:19 PDT 2025
https://github.com/ro-i updated https://github.com/llvm/llvm-project/pull/135090
From 480e54ca8d96983344a14145aa04bae0eae32cba Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Wed, 9 Apr 2025 16:20:01 -0500
Subject: [PATCH 1/2] [AMDGPU] IGLP: Fixes for VMEM load detection and unsigned
int handling
Fixes:
- detection of VMEM_READS which are FLAT loads.
- unsigned int underflows in
`MFMASmallGemmSingleWaveOpt::applyIGLPStrategy`.
- resetting global static DSWCounters for new runs.
This LLVM defect was identified via the AMD Fuzzing project.
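
For context on the second bullet: the affected loops in
`MFMASmallGemmSingleWaveOpt::applyIGLPStrategy` computed their trip counts as
differences of unsigned counters (DSRCount - 4, DSWWithPermCount -
DSWWithSharedVMEMCount, DSWCount - DSWWithPermCount). Whenever the left
operand is smaller, the subtraction wraps around to a huge value and the loop
runs far more iterations than intended. The patch instead starts each
induction variable at the subtrahend. A minimal standalone sketch of the
hazard and the fix, with hypothetical counter values (not taken from the
patch):

    // Sketch of the unsigned wraparound fixed by this patch.
    #include <cstdio>

    int main() {
      unsigned DSWCount = 2, DSWWithPermCount = 3;

      // Before: "DSWCount - DSWWithPermCount" wraps to 4294967295 when
      // DSWWithPermCount > DSWCount, so a loop bounded by it runs (almost)
      // forever:
      //   for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { ... }

      // After: start the induction variable at the subtrahend, so the body
      // executes max(DSWCount - DSWWithPermCount, 0) times.
      unsigned Iterations = 0;
      for (unsigned I = DSWWithPermCount; I < DSWCount; I++)
        ++Iterations;
      printf("loop body executed %u times\n", Iterations); // prints 0, not ~4e9
      return 0;
    }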
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 29 ++++++++++++-------
.../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 22 ++++++++++++++
2 files changed, 40 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 7b4d00c8214cb..cea3bcf4b31df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -80,6 +80,10 @@ enum class SchedGroupMask {
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
+static bool handleAsVMEMInstr(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
+}
+
class SchedGroup;
// InstructionRule class is used to enact a filter which determines whether or
@@ -1891,7 +1895,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
}
- assert(Cache->size());
+ assert(!MFMAsFound || Cache->size());
auto *DAG = SyncPipe[0].DAG;
for (auto &Elt : *Cache) {
if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
@@ -1994,7 +1998,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
if (NumBits < 128) {
- assert(TII->isVMEM(*MI) && MI->mayLoad());
+ assert(handleAsVMEMInstr(*MI, TII) && MI->mayLoad());
if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
MRI, MI->getOperand(0))) <=
128)
@@ -2079,6 +2083,9 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;
+static void resetDSWCounters() {
+ DSWCount = DSWWithPermCount = DSWWithSharedVMEMCount = 0;
+}
bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
@@ -2138,7 +2145,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
for (auto &Succ : Pred.getSUnit()->Succs) {
auto *MI = Succ.getSUnit()->getInstr();
- if (!TII->isVMEM(*MI) || !MI->mayLoad())
+ if (!handleAsVMEMInstr(*MI, TII) || !MI->mayLoad())
continue;
if (MissedAny || !VMEMLookup.size()) {
@@ -2200,7 +2207,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
// Interleave MFMA with DS_READ prefetch
- for (unsigned I = 0; I < DSRCount - 4; ++I) {
+ for (unsigned I = 4; I < DSRCount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
@@ -2213,7 +2220,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
// Phase 2a: Loop carried dependency with V_PERM
// Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
// depend on. Interleave MFMA to keep XDL unit busy throughout.
- for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
+ for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
@@ -2250,7 +2257,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
// Phase 2b: Loop carried dependency without V_PERM
// Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
// Interleave MFMA to keep XDL unit busy throughout.
- for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
+ for (unsigned I = DSWWithPermCount; I < DSWCount; I++) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
@@ -2426,17 +2433,15 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = true;
else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
- (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+ handleAsVMEMInstr(MI, TII))
Result = true;
else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
- MI.mayLoad() &&
- (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+ MI.mayLoad() && handleAsVMEMInstr(MI, TII))
Result = true;
else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
- MI.mayStore() &&
- (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+ MI.mayStore() && handleAsVMEMInstr(MI, TII))
Result = true;
else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
@@ -2703,5 +2708,7 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
/// for a given region.
std::unique_ptr<ScheduleDAGMutation>
llvm::createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
+ if (Phase == AMDGPU::SchedulingPhase::Initial)
+ resetDSWCounters();
return std::make_unique<IGroupLPDAGMutation>(Phase);
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 08c0d15432915..3ce25c0fd1fef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -321,6 +321,28 @@ entry:
ret void
}
+; Check fixes for:
+; - detection of VMEM_READS which are FLAT loads.
+; - unsigned int underflows in MFMASmallGemmSingleWaveOpt::applyIGLPStrategy.
+; - resetting global static DSWCounters for new runs.
+; (reduced fuzzer-generated test case)
+define amdgpu_kernel void @test_iglp_opt_flat_load(ptr %ptr1, ptr %ptr2, ptr addrspace(3) %ptr3, ptr addrspace(3) %ptr4) {
+entry:
+ %LGV2 = load <8 x half>, ptr %ptr1, align 16
+ %LGV = load i1, ptr %ptr2, align 1
+ call void @llvm.amdgcn.iglp.opt(i32 1)
+ %C = fcmp ugt <8 x half> zeroinitializer, %LGV2
+ store <8 x i1> %C, ptr addrspace(3) %ptr3, align 1
+ br i1 %LGV, label %common.ret, label %F
+
+common.ret: ; preds = %F, %entry
+ ret void
+
+F: ; preds = %entry
+ store <32 x float> zeroinitializer, ptr addrspace(3) %ptr4, align 128
+ br label %common.ret
+}
+
declare void @llvm.amdgcn.iglp.opt(i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1
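
On the third bullet of the commit message: DSWCount, DSWWithPermCount and
DSWWithSharedVMEMCount are file-scope statics, so values accumulated while
scheduling one function survive into later runs of the mutation within the
same compiler process. The patch clears them whenever the mutation is created
for the initial scheduling phase. A minimal sketch of the stale-state problem
and the reset pattern; runStrategy below is a hypothetical stand-in for one
scheduling run and is not part of the patch:

    #include <cstdio>

    static unsigned DSWCount = 0;
    static unsigned DSWWithPermCount = 0;
    static unsigned DSWWithSharedVMEMCount = 0;

    static void resetDSWCounters() {
      DSWCount = DSWWithPermCount = DSWWithSharedVMEMCount = 0;
    }

    // Hypothetical stand-in for one scheduling run over a function/region.
    static void runStrategy(unsigned NumDSWrites, bool ResetFirst) {
      if (ResetFirst)
        resetDSWCounters();
      DSWCount += NumDSWrites;
      printf("DSWCount = %u\n", DSWCount);
    }

    int main() {
      runStrategy(3, /*ResetFirst=*/false); // prints 3
      runStrategy(2, /*ResetFirst=*/false); // without a reset: prints 5 (stale state)
      runStrategy(2, /*ResetFirst=*/true);  // with the reset: prints 2
      return 0;
    }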
From a188054e7b3b95cc5cc8acf664ab2988f0779567 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Thu, 10 Apr 2025 06:59:08 -0500
Subject: [PATCH 2/2] fix test
---
.../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 74 +++++++++++++++++++
1 file changed, 74 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 3ce25c0fd1fef..825c3394d14c6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -327,6 +327,80 @@ entry:
; - resetting global static DSWCounters for new runs.
; (reduced fuzzer-generated test case)
define amdgpu_kernel void @test_iglp_opt_flat_load(ptr %ptr1, ptr %ptr2, ptr addrspace(3) %ptr3, ptr addrspace(3) %ptr4) {
+; GCN-LABEL: test_iglp_opt_flat_load:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GCN-NEXT: s_mov_b32 s8, 0
+; GCN-NEXT: ; iglp_opt mask(0x00000001)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v6, s2
+; GCN-NEXT: v_mov_b32_e32 v7, s3
+; GCN-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
+; GCN-NEXT: flat_load_ubyte v8, v[6:7]
+; GCN-NEXT: ; kill: killed $vgpr4 killed $vgpr5
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: ; kill: killed $vgpr6 killed $vgpr7
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_nle_f16_e32 vcc, 0, v3
+; GCN-NEXT: v_cmp_nge_f16_sdwa s[10:11], v3, v4 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT: v_cmp_nge_f16_sdwa s[14:15], v2, v4 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT: v_cmp_nge_f16_sdwa s[18:19], v0, v4 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT: v_and_b32_e32 v5, 1, v8
+; GCN-NEXT: v_cmp_nle_f16_e64 s[0:1], 0, v2
+; GCN-NEXT: v_cmp_nle_f16_e64 s[2:3], 0, v1
+; GCN-NEXT: v_cmp_nge_f16_sdwa s[16:17], v1, v4 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT: v_cmp_nle_f16_e64 s[4:5], 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[10:11]
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[14:15]
+; GCN-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[18:19]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v5
+; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[2:3]
+; GCN-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[16:17]
+; GCN-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GCN-NEXT: v_lshlrev_b16_e32 v0, 2, v0
+; GCN-NEXT: v_lshlrev_b16_e32 v1, 3, v1
+; GCN-NEXT: v_lshlrev_b16_e32 v2, 1, v2
+; GCN-NEXT: v_lshlrev_b16_e32 v6, 1, v6
+; GCN-NEXT: v_lshlrev_b16_e32 v4, 2, v4
+; GCN-NEXT: v_lshlrev_b16_e32 v5, 3, v5
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_or_b32_e32 v3, v7, v6
+; GCN-NEXT: v_or_b32_e32 v2, v5, v4
+; GCN-NEXT: v_and_b32_e32 v1, 3, v1
+; GCN-NEXT: v_and_b32_e32 v3, 3, v3
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_lshlrev_b16_e32 v0, 4, v0
+; GCN-NEXT: v_and_b32_e32 v1, 15, v1
+; GCN-NEXT: s_xor_b64 s[0:1], s[6:7], -1
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_mov_b32_e32 v1, s12
+; GCN-NEXT: ds_write_b8 v1, v0
+; GCN-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; GCN-NEXT: s_cbranch_execz .LBB4_2
+; GCN-NEXT: ; %bb.1: ; %F
+; GCN-NEXT: s_mov_b32 s9, s8
+; GCN-NEXT: s_mov_b32 s10, s8
+; GCN-NEXT: s_mov_b32 s11, s8
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GCN-NEXT: v_mov_b32_e32 v4, s13
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:112
+; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:96
+; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:80
+; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:64
+; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:48
+; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:32
+; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:16
+; GCN-NEXT: ds_write_b128 v4, v[0:3]
+; GCN-NEXT: .LBB4_2: ; %common.ret
+; GCN-NEXT: s_endpgm
entry:
%LGV2 = load <8 x half>, ptr %ptr1, align 16
%LGV = load i1, ptr %ptr2, align 1