[llvm-branch-commits] [llvm] [AMDGPU] Add gfx1251 speed model (PR #203142)
Stanislav Mekhanoshin via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jun 10 18:47:29 PDT 2026
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/203142
>From 77825c81bdd7956ee57adfee33253c11bf32e505 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 10 Jun 2026 16:40:25 -0700
Subject: [PATCH] [AMDGPU] Add gfx1251 speed model
Adjust the generic speed model to account for the slowest of the covered targets.
---
llvm/lib/Target/AMDGPU/GCNProcessors.td | 4 +-
llvm/lib/Target/AMDGPU/SISchedule.td | 65 +++++++++++++++++++++++--
2 files changed, 62 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index b2ca64fd56020..c2aac75a73ad0 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -342,12 +342,12 @@ def : ProcessorModel<"gfx1250", GFX1250SpeedModel,
FeatureISAVersion12_50.Features
>;
-def : ProcessorModel<"gfx1251", GFX1250SpeedModel,
+def : ProcessorModel<"gfx1251", GFX1251SpeedModel,
FeatureISAVersion12_51.Features
>;
// [gfx1250, gfx1251]
-def : ProcessorModel<"gfx12-5-generic", GFX1250SpeedModel,
+def : ProcessorModel<"gfx12-5-generic", GFX125xGenericSpeedModel,
FeatureISAVersion12_5_Generic.Features
>;
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index 295a7dbe5cfd7..d542d0780edcc 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -69,6 +69,7 @@ def Write16PassDGEMM : SchedWrite;
// WMMA/SWMMA instructions
def WriteXDL2PassWMMA : SchedWrite;
def WriteXDL4PassWMMA : SchedWrite;
+def WriteXDL8PassWMMA : SchedWrite;
def Write4PassWMMA : SchedWrite;
def Write8PassWMMA : SchedWrite;
def Write16PassWMMA : SchedWrite;
@@ -116,6 +117,8 @@ def GFX10SpeedModel : SISchedMachineModel;
def GFX11SpeedModel : SISchedMachineModel;
def GFX12SpeedModel : SISchedMachineModel;
def GFX1250SpeedModel : SISchedMachineModel;
+def GFX1251SpeedModel : SISchedMachineModel;
+def GFX125xGenericSpeedModel : SISchedMachineModel;
// XXX: Are the resource counts correct?
def HWBranch : ProcResource<1> {
@@ -508,12 +511,27 @@ def WriteWMMAScale_16X16X128_F8F6F4 : SchedWriteVariant<[
SchedVar<NoSchedPred, [WriteXDL2PassWMMA]>
]>;
+// Check whether matrix_a_fmt and matrix_b_fmt are both MATRIX_FMT_FP4.
+def PredIsFP4_WMMA_SCALE : SchedPredicate<[{
+ TII->getNamedOperand(*MI, AMDGPU::OpName::matrix_a_fmt)->getImm() == AMDGPU::WMMA::MATRIX_FMT_FP4 &&
+ TII->getNamedOperand(*MI, AMDGPU::OpName::matrix_b_fmt)->getImm() == AMDGPU::WMMA::MATRIX_FMT_FP4
+}]>;
+
+// If both matrix formats are FP4, instruction takes 4 passes.
+// Otherwise it takes 8 passes.
+def WriteWMMAScaleFP4_16X16X128_F8F6F4 : SchedWriteVariant<[
+ SchedVar<PredIsFP4_WMMA_SCALE, [WriteXDL4PassWMMA]>,
+ SchedVar<NoSchedPred, [WriteXDL8PassWMMA]>
+]>;
+
multiclass GFX125xCommonWriteRes {
let ReleaseAtCycles = [8] in
def : HWWriteRes<WriteXDL2PassWMMA, [HWXDL], 8>;
let ReleaseAtCycles = [16] in
def : HWWriteRes<WriteXDL4PassWMMA, [HWXDL], 16>;
+let ReleaseAtCycles = [32] in
+def : HWWriteRes<WriteXDL8PassWMMA, [HWXDL], 32>;
def : HWWriteRes<Write4PassWMMA, [HWVALU], 16>;
def : HWWriteRes<Write8PassWMMA, [HWVALU], 32>;
@@ -536,12 +554,7 @@ def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
def : InstRW<[WriteCopy], (instrs COPY)>;
-
-def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(FP8|BF8|BF16|F16)_w32")>;
-def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)_w32")>;
-def : InstRW<[WriteWMMAScale_16X16X128_F8F6F4], (instregex "^V_WMMA_.*_16X16X128_F8F6F4.*_w32")>;
def : InstRW<[Write4PassWMMA], (instregex "^V_WMMA_F32_16X16X4_F32_w32")>;
-def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>;
let NumMicroOps = 0 in {
def : HWWriteRes<WriteVALUDummy, [HWVALU], 5>;
@@ -559,4 +572,46 @@ def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 37>;
def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 37>;
def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 37>;
def : HWWriteRes<WriteTrans64, [HWVALU, HWTransVALU, HWRC], 38>;
+
+def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(FP8|BF8|BF16|F16)_w32")>;
+def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)_w32")>;
+def : InstRW<[WriteWMMAScale_16X16X128_F8F6F4], (instregex "^V_WMMA_.*_16X16X128_F8F6F4.*_w32")>;
+def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>;
} // SchedModel = GFX1250SpeedModel
+
+let SchedModel = GFX1251SpeedModel in {
+defm : GFX125xCommonWriteRes;
+
+def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 5>;
+def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>;
+def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 6>;
+def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 5>;
+def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 6>;
+def : HWWriteRes<WriteTrans64, [HWVALU, HWTransVALU, HWRC], 7>;
+
+def : InstRW<[WriteXDL8PassWMMA], (instregex "^V_WMMA_(F32|F16)_16X16X128_(FP8|BF8)_(FP8|BF8)_w32")>;
+def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_SWMMAC_(F32|F16)_16X16X128_(FP8|BF8)_(FP8|BF8)_w32")>;
+def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_16X16X(32|64)_.*(FP8|BF8|BF16|F16)_w32")>;
+def : InstRW<[WriteXDL8PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)")>;
+def : InstRW<[WriteWMMAScaleFP4_16X16X128_F8F6F4], (instregex "^V_WMMA_.*_16X16X128_F8F6F4.*_w32")>;
+def : InstRW<[WriteXDL8PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>;
+def : InstRW<[Write4PassWMMA], (instregex "^V_WMMA_F64_16X16X4_F64")>;
+} // SchedModel = GFX1251SpeedModel
+
+let SchedModel = GFX125xGenericSpeedModel in {
+defm : GFX125xCommonWriteRes;
+
+def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
+def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>;
+def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 37>;
+def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 37>;
+def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 37>;
+def : HWWriteRes<WriteTrans64, [HWVALU, HWTransVALU, HWRC], 38>;
+
+def : InstRW<[WriteXDL8PassWMMA], (instregex "^V_WMMA_(F32|F16)_16X16X128_(FP8|BF8)_(FP8|BF8)_w32")>;
+def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_SWMMAC_(F32|F16)_16X16X128_(FP8|BF8)_(FP8|BF8)_w32")>;
+def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_16X16X(32|64)_.*(FP8|BF8|BF16|F16)_w32")>;
+def : InstRW<[WriteXDL8PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)")>;
+def : InstRW<[WriteWMMAScaleFP4_16X16X128_F8F6F4], (instregex "^V_WMMA_.*_16X16X128_F8F6F4.*_w32")>;
+def : InstRW<[WriteXDL8PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>;
+} // SchedModel = GFX125xGenericSpeedModel
More information about the llvm-branch-commits
mailing list