[llvm-branch-commits] [llvm] AMDGPU: Define new sched model for gfx950 (PR #117261)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Nov 21 15:07:45 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
A few instructions changed issue rate on gfx950, so it gets its own scheduling model instead of reusing the gfx940 one.
---
Patch is 306.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117261.diff
7 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/GCNProcessors.td (+1-1)
- (modified) llvm/lib/Target/AMDGPU/SISchedule.td (+63)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll (+1360-820)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll (+192-120)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll (+456-253)
- (modified) llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir (+24-13)
- (modified) llvm/test/tools/llvm-mca/AMDGPU/gfx950.s (+302-15)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 3403cbab526d46..508f2dd83108d9 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -204,7 +204,7 @@ def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel,
FeatureISAVersion9_4_2.Features
>;
-def : ProcessorModel<"gfx950", SIDPGFX940FullSpeedModel,
+def : ProcessorModel<"gfx950", SIDPGFX950FullSpeedModel,
FeatureISAVersion9_5_0.Features
>;
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index a60b1f28e9d34c..117add324db565 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -64,6 +64,7 @@ def Write8PassMAI : SchedWrite;
def Write16PassMAI : SchedWrite;
def Write4PassDGEMM : SchedWrite;
def Write8PassDGEMM : SchedWrite;
+def Write16PassDGEMM : SchedWrite;
// Scalar float instructions
def WriteSFPU : SchedWrite;
@@ -94,6 +95,7 @@ def SIFullSpeedModel : SISchedMachineModel;
def SIQuarterSpeedModel : SISchedMachineModel;
def SIDPFullSpeedModel : SISchedMachineModel;
def SIDPGFX940FullSpeedModel : SISchedMachineModel;
+def SIDPGFX950FullSpeedModel : SISchedMachineModel;
def GFX10SpeedModel : SISchedMachineModel;
def GFX11SpeedModel : SISchedMachineModel;
def GFX12SpeedModel : SISchedMachineModel;
@@ -169,6 +171,8 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<Write4PassDGEMM, 4>;
let ReleaseAtCycles = [8] in
def : HWVALUWriteRes<Write8PassDGEMM, 8>;
+ let ReleaseAtCycles = [16] in
+ def : HWVALUWriteRes<Write16PassDGEMM, 16>;
let ReleaseAtCycles = [2] in
def : HWWriteRes<Write2PassMAI, [HWXDL], 2>;
@@ -201,6 +205,13 @@ def WriteCopy : SchedWriteVariant<[
SchedVar<PredIsVGPR64Copy, [Write64Bit]>,
SchedVar<NoSchedPred, [WriteSALU]>]>;
+// Check if any matrix inputs are interpreted as f8 in an f8f6f4 mfma
+// instruction.
+def PredIsF8_MFMA_SCALE : SchedPredicate<[{
+ TII->getNamedOperand(*MI, AMDGPU::OpName::cbsz)->getImm() <= AMDGPU::MFMAScaleFormats::FP8_E5M2 ||
+ TII->getNamedOperand(*MI, AMDGPU::OpName::blgp)->getImm() <= AMDGPU::MFMAScaleFormats::FP8_E5M2
+}]>;
+
let SchedModel = SIFullSpeedModel in {
defm : SICommonWriteRes;
@@ -299,6 +310,58 @@ def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>;
} // End SchedModel = SIDPGFX940FullSpeedModel
+
+let SchedModel = SIDPGFX950FullSpeedModel in {
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 1>;
+def : HWVALUWriteRes<WriteDouble, 1>;
+def : HWVALUWriteRes<WriteDoubleAdd, 1>;
+def : HWVALUWriteRes<WriteDoubleCvt, 1>;
+def : HWVALUWriteRes<WriteTrans64, 4>;
+def : HWVALUWriteRes<WriteIntMul, 1>;
+def : HWVALUWriteRes<Write64Bit, 1>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
+def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>;
+
+def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X8X")>;
+def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X16")>;
+def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X32")>;
+def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X64")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X[14][FBI]")>;
+
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X4XF")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X8")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X16")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X32_")>;
+def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X[124][FBI]")>;
+
+def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>;
+def : InstRW<[Write16PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>;
+
+def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_16X16X")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>;
+
+
+// If either matrix format is f8, the instruction takes 2x as many
+// cycles. TODO: This isn't reflected in MCA.
+def WriteMFMAScale_16X16X128_F8F6F4 : SchedWriteVariant<[
+ SchedVar<PredIsF8_MFMA_SCALE, [Write8PassMAI]>,
+ SchedVar<NoSchedPred, [Write4PassMAI]>]>;
+def WriteMFMAScale_32X32X64_F8F6F4 : SchedWriteVariant<[
+ SchedVar<PredIsF8_MFMA_SCALE, [Write16PassMAI]>,
+ SchedVar<NoSchedPred, [Write8PassMAI]>]>;
+
+def : InstRW<[WriteMFMAScale_16X16X128_F8F6F4, MIMFMARead],
+ (instregex "^V_MFMA(_SCALE)?_.32_16X16X128_F8F6F4")>;
+def : InstRW<[WriteMFMAScale_32X32X64_F8F6F4, MIMFMARead],
+ (instregex "^V_MFMA(_SCALE)?_.32_32X32X64_F8F6F4")>;
+
+} // End SchedModel = SIDPGFX950FullSpeedModel
+
+
let SchedModel = GFX10SpeedModel in {
// The latency values are 1 / (operations / cycle).
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index d0ae669ffb3d68..5d149f7c0c62ef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
+; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
@@ -49,52 +49,366 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal
ret <4 x float> %result
}
-define <4 x float> @test_mfma_f32_16x16x32_f16__mac(<4 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) {
-; GCN-LABEL: test_mfma_f32_16x16x32_f16__mac:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v0
-; GCN-NEXT: v_accvgpr_write_b32 a1, v1
-; GCN-NEXT: v_accvgpr_write_b32 a2, v2
-; GCN-NEXT: v_accvgpr_write_b32 a3, v3
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[4:7], v[8:11], a[0:3]
-; GCN-NEXT: s_nop 6
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: s_nop 5
+; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
- ret <4 x float> %result
+ store <4 x float> %result, ptr addrspace(1) %out
+ ret void
}
-define <4 x float> @test_mfma_f32_16x16x32_f16___flags__mac(<4 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) {
-; GCN-LABEL: test_mfma_f32_16x16x32_f16___flags__mac:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v0
-; GCN-NEXT: v_accvgpr_write_b32 a1, v1
-; GCN-NEXT: v_accvgpr_write_b32 a2, v2
-; GCN-NEXT: v_accvgpr_write_b32 a3, v3
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[4:7], v[8:11], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT: s_nop 6
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
- ret <4 x float> %result
+define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: s_nop 5
+; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: s_endpgm
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
+ store <4 x float> %result, ptr addrspace(1) %out
+ ret void
}
; --------------------------------------------------------------------
; llvm.amdgcn.mfma.f32.32x32x16.f16
; --------------------------------------------------------------------
-define <16 x float> @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_f32_32x32x16_f16:
+define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) #1 {
+; SDAG-LABEL: test_mfma_f32_32x32x16_f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_32x32x16_f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], 0
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], 48
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], 16
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+ store volatile <16 x float> %result, ptr addrspace(1) null
+ store volatile <16 x float> %arg2, ptr addrspace(1) null
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) #1 {
+; SDAG-LABEL: test_mfma_f32_32x32x16_f16__flags:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/117261
More information about the llvm-branch-commits mailing list