[clang] [llvm] [AMDGPU] First installment of IGLP_OPT 4 (MFMAValuSpacingOpt) implementation (PR #190916)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Apr 7 23:12:47 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: hidekisaito
<details>
<summary>Changes</summary>
Add the MFMAValuSpacingOpt strategy to the pre-RA scheduler. It tries to schedule
an equal number of VALU MIR instructions between consecutive MFMAs.
Assisted-by: Cursor
---
Full diff: https://github.com/llvm/llvm-project/pull/190916.diff
7 Files Affected:
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn.cl (+4-2)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+7-2)
- (modified) llvm/lib/IR/Verifier.cpp (+10)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp (+74-1)
- (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (+26)
- (added) llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll (+72)
- (added) llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir (+35)
``````````diff
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 0fc40d396a87d..cd4cb103291dd 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -980,14 +980,16 @@ void test_sched_group_barrier()
// CHECK-LABEL: @test_iglp_opt
// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 0)
// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 1)
+// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 2)
+// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 3)
// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 4)
-// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 15)
void test_iglp_opt()
{
__builtin_amdgcn_iglp_opt(0);
__builtin_amdgcn_iglp_opt(1);
+ __builtin_amdgcn_iglp_opt(2);
+ __builtin_amdgcn_iglp_opt(3);
__builtin_amdgcn_iglp_opt(4);
- __builtin_amdgcn_iglp_opt(15);
}
// CHECK-LABEL: @test_s_sleep
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f576972183eca..3efa959241266 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -377,8 +377,13 @@ def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
-// Scheduler optimization hint.
-// MASK = 0: Small gemm opt
+// Scheduler optimization hint: immediate selects the IGLP strategy in
+// AMDGPUIGroupLP.cpp (createIGLPStrategy). Must be a constant in [0,4]. Mappings:
+// 0 - MFMA small-GEMM scheduling (MFMASmallGemmOpt).
+// 1 - MFMA small-GEMM single-wave variant (MFMASmallGemmSingleWaveOpt).
+// 2 - MFMAExpInterleaveOpt (TRANS/MFMA scheduling pipeline; see AMDGPUIGroupLP.cpp).
+// 3 - MFMAExpSimpleInterleaveOpt (simpler TRANS-then-MFMA interleave pattern).
+// 4 - MFMA (or WMMA) / VALU spacing (MFMAValuSpacingOpt).
def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">,
Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index cf9131c66d6c3..36e5f96215783 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -7097,6 +7097,16 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"llvm.amdgcn.s.prefetch.data only supports global or constant memory");
break;
}
+ case Intrinsic::amdgcn_iglp_opt: {
+ const auto *Mask = dyn_cast<ConstantInt>(Call.getArgOperand(0));
+ Check(Mask, "llvm.amdgcn.iglp.opt requires a constant mask argument", &Call,
+ Call.getArgOperand(0));
+ const int64_t V = Mask->getSExtValue();
+ Check(V >= 0 && V <= 4,
+ "llvm.amdgcn.iglp.opt mask must be in the range [0,4]", &Call,
+ Call.getArgOperand(0));
+ break;
+ }
case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
Value *Src0 = Call.getArgOperand(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index d49ec90e4c212..363c34c07e2fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -823,7 +823,8 @@ enum IGLPStrategyID : int {
MFMASmallGemmOptID = 0,
MFMASmallGemmSingleWaveOptID = 1,
MFMAExpInterleaveID = 2,
- MFMAExpSimpleInterleaveID = 3
+ MFMAExpSimpleInterleaveID = 3,
+ MFMAValuSpacingOptID = 4,
};
// Implement a IGLP scheduling strategy.
@@ -896,6 +897,76 @@ bool MFMASmallGemmOpt::applyIGLPStrategy(
return true;
}
+static bool isMFMAValuSpacingGapValu(const MachineInstr &MI,
+ const SIInstrInfo *TII) {
+ if (MI.isMetaInstruction())
+ return false;
+ return TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI) &&
+ !MI.mayLoadOrStore();
+}
+
+/// Interleave MFMA/WMMA with VALU slots: each repeating stage is one MFMA (or
+/// WMMA), then up to N VALU ops per gap where N = floor(#VALU / #MFMA) in this
+/// schedule region (same predicate as \c isMFMAValuSpacingGapValu), at least 1.
+/// Template length uses MFMACount * 3 for slack, like MFMASmallGemmOpt.
+/// \p IsBottomUp is false so SchedGroup pipeline order matches forward program
+/// order (MFMA before its VALU gap).
+class MFMAValuSpacingOpt final : public IGLPStrategy {
+public:
+ bool applyIGLPStrategy(
+ DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ AMDGPU::SchedulingPhase Phase) override;
+
+ bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
+ AMDGPU::SchedulingPhase Phase) override {
+ for (const MachineInstr &I : *DAG)
+ if (TII->isMFMAorWMMA(I))
+ return true;
+ return false;
+ }
+
+ MFMAValuSpacingOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+ : IGLPStrategy(DAG, TII) {
+ IsBottomUp = false;
+ }
+};
+
+bool MFMAValuSpacingOpt::applyIGLPStrategy(
+ DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ AMDGPU::SchedulingPhase Phase) {
+ unsigned MFMACount = 0;
+ unsigned ValuCount = 0;
+ for (const MachineInstr &I : *DAG) {
+ if (TII->isMFMAorWMMA(I))
+ ++MFMACount;
+ else if (isMFMAValuSpacingGapValu(I, TII))
+ ++ValuCount;
+ }
+
+ unsigned ValuGap = 1;
+ if (MFMACount > 0) {
+ ValuGap = ValuCount / MFMACount;
+ if (ValuGap < 1)
+ ValuGap = 1;
+ }
+
+ const unsigned PipelineSyncID = 0;
+ SchedGroup *SG = nullptr;
+ for (unsigned I = 0; I < MFMACount * 3; ++I) {
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+ SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VALU, ValuGap, PipelineSyncID, DAG, TII);
+ SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
+ }
+
+ return true;
+}
+
class MFMAExpInterleaveOpt final : public IGLPStrategy {
private:
// The count of TRANS SUs involved in the interleaved pipeline
@@ -2327,6 +2398,8 @@ createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
case MFMAExpSimpleInterleaveID:
return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
+ case MFMAValuSpacingOptID:
+ return std::make_unique<MFMAValuSpacingOpt>(DAG, TII);
}
llvm_unreachable("Unknown IGLPStrategyID");
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index ad24bad1fd5d7..94d13bd01af26 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -3244,6 +3244,21 @@ static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
});
}
+static bool hasIGLPOpt(ScheduleDAGInstrs *DAG, int64_t StrategyImm) {
+ return any_of(*DAG, [StrategyImm](MachineBasicBlock::iterator MI) {
+ return MI->getOpcode() == AMDGPU::IGLP_OPT && MI->getNumOperands() >= 1 &&
+ MI->getOperand(0).isImm() &&
+ MI->getOperand(0).getImm() == StrategyImm;
+ });
+}
+
+static bool hasSchedBarrier(ScheduleDAGInstrs *DAG) {
+ return any_of(*DAG, [](MachineBasicBlock::iterator MI) {
+ unsigned Opc = MI->getOpcode();
+ return Opc == AMDGPU::SCHED_BARRIER || Opc == AMDGPU::SCHED_GROUP_BARRIER;
+ });
+}
+
GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
bool RemoveKillFlags)
@@ -3252,6 +3267,17 @@ GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
void GCNPostScheduleDAGMILive::schedule() {
HasIGLPInstrs = hasIGLPInstrs(this);
if (HasIGLPInstrs) {
+ // MFMAValuSpacingOpt is a pre-RA strategy whose interleaving is correct
+ // after the initial machine scheduler. The post-RA scheduler would undo
+ // the reordering, so preserve the pre-RA schedule by skipping here.
+ // When SCHED_[GROUP_]BARRIER coexists with IGLP_OPT, IGroupLP ignores the
+ // IGLP_OPT (they are mutually exclusive), so let post-RA scheduling proceed
+ // normally.
+ // Immediate 4 == MFMAValuSpacingOpt in AMDGPUIGroupLP.cpp (IGLPStrategyID).
+ if (hasIGLPOpt(this, 4) && !hasSchedBarrier(this)) {
+ HasIGLPInstrs = false;
+ return;
+ }
SavedMutations.clear();
SavedMutations.swap(Mutations);
addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
diff --git a/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll
new file mode 100644
index 0000000000000..fad3dde3d06bf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Full codegen on gfx950. Two MFMAs fed by loaded floats; three independent
+; i32 muls stored to a second buffer. sched.barrier(0) isolates the MUL+MFMA
+; region so that address-computation VALUs don't inflate the VALU gap in
+; MFMAValuSpacingOpt.
+;
+; With iglp_opt(4) the expected MFMA/VALU interleaving (ValuGap=1) is:
+; MFMA, MUL, MFMA, MUL, MUL
+;
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -o - %s | FileCheck %s
+
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare void @llvm.amdgcn.iglp.opt(i32 immarg)
+declare void @llvm.amdgcn.sched.barrier(i32 immarg)
+
+define amdgpu_kernel void @mfma_valu_iglp4(ptr addrspace(1) %p, ptr addrspace(1) %q) #0 {
+; CHECK-LABEL: mfma_valu_iglp4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v8, 2, v0
+; CHECK-NEXT: v_mov_b32_e32 v9, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[6:7], v8, s[0:1]
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
+; CHECK-NEXT: ; sched_barrier mask(0x00000000)
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3]
+; CHECK-NEXT: v_mul_lo_u32 v4, v6, v6
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3]
+; CHECK-NEXT: v_mul_lo_u32 v5, v6, v7
+; CHECK-NEXT: v_mul_lo_u32 v6, v7, v7
+; CHECK-NEXT: ; iglp_opt mask(0x00000004)
+; CHECK-NEXT: ; sched_barrier mask(0x00000000)
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
+; CHECK-NEXT: global_store_dwordx3 v8, v[4:6], s[2:3]
+; CHECK-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %t = load <4 x float>, ptr addrspace(1) %p
+ %gep0 = getelementptr inbounds float, ptr addrspace(1) %p, i32 %tid
+ %gep1 = getelementptr inbounds float, ptr addrspace(1) %gep0, i32 1
+ %f0 = load float, ptr addrspace(1) %gep0
+ %f1 = load float, ptr addrspace(1) %gep1
+ %i0 = bitcast float %f0 to i32
+ %i1 = bitcast float %f1 to i32
+ call void @llvm.amdgcn.sched.barrier(i32 0)
+ %m0 = mul nsw i32 %i0, %i0
+ %m1 = mul nsw i32 %i0, %i1
+ %m2 = mul nsw i32 %i1, %i1
+ call void @llvm.amdgcn.iglp.opt(i32 4)
+ %mai = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %f0, float %f1, <4 x float> %t, i32 0, i32 0, i32 0)
+ %mai2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %f0, float %f1, <4 x float> %mai, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.sched.barrier(i32 0)
+ store <4 x float> %mai2, ptr addrspace(1) %p
+ %qgep0 = getelementptr inbounds i32, ptr addrspace(1) %q, i32 %tid
+ %qgep1 = getelementptr inbounds i32, ptr addrspace(1) %qgep0, i32 1
+ %qgep2 = getelementptr inbounds i32, ptr addrspace(1) %qgep0, i32 2
+ store i32 %m0, ptr addrspace(1) %qgep0
+ store i32 %m1, ptr addrspace(1) %qgep1
+ store i32 %m2, ptr addrspace(1) %qgep2
+ ret void
+}
+
+attributes #0 = { "uniform-work-group-size"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir
new file mode 100644
index 0000000000000..7f173ddc8e5aa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir
@@ -0,0 +1,35 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# Pre-RA machine scheduler with IGroupLP / MFMAValuSpacingOpt (iglp_opt(4)).
+# With IGLP_OPT 4 the expected MFMA/VALU interleaving (ValuGap=1) is:
+# MFMA, MUL, MFMA, MUL, MUL
+#
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+---
+name: mfma_valu_iglp4
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: mfma_valu_iglp4
+ ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[DEF]], [[DEF1]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DEF]], [[DEF]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[DEF]], [[DEF1]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DEF]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DEF1]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: IGLP_OPT 4
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_]], implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_1]]
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:areg_128_align2 = IMPLICIT_DEF
+ %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %0, %0, implicit $exec
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %0, %1, implicit $exec
+ %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %1, %1, implicit $exec
+ %6:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 %0, %1, %2, 0, 0, 0, implicit $mode, implicit $exec
+ %7:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 %0, %1, %6, 0, 0, 0, implicit $mode, implicit $exec
+ IGLP_OPT 4
+ S_ENDPGM 0, implicit %3, implicit %4, implicit %5, implicit %7
+
+...
``````````
</details>
https://github.com/llvm/llvm-project/pull/190916
More information about the cfe-commits
mailing list