[llvm] AMDGPU: Correct cycle counts for f64 mfma on gfx940 (PR #83782)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 4 02:34:23 PST 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/83782
>From 49d51f6d5a199431b28e0999e0a76f888d78d1d8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 4 Mar 2024 13:10:39 +0530
Subject: [PATCH] AMDGPU: Correct cycle counts for f64 mfma on gfx940/gfx90a
The manual states these are 4-pass and 8-pass instructions. I'm not
sure whether reporting these as using the VALU resource rather than
the XDL resource is correct. The Latency and ReleaseAtCycles values
were also mismatched; I'm not sure whether that was intentional.
---
llvm/lib/Target/AMDGPU/SISchedule.td | 4 +-
.../CodeGen/AMDGPU/mai-hazards-gfx90a.mir | 6 +--
.../CodeGen/AMDGPU/mai-hazards-gfx940.mir | 12 +++---
llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s | 38 ++++++++++++++++
llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s | 43 +++++++++++++------
5 files changed, 77 insertions(+), 26 deletions(-)
create mode 100644 llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index b0e8e4112254d8..a60b1f28e9d34c 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -165,8 +165,10 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<WriteTrans32, 4>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+ let ReleaseAtCycles = [4] in
def : HWVALUWriteRes<Write4PassDGEMM, 4>;
- def : HWVALUWriteRes<Write8PassDGEMM, 16>;
+ let ReleaseAtCycles = [8] in
+ def : HWVALUWriteRes<Write8PassDGEMM, 8>;
let ReleaseAtCycles = [2] in
def : HWWriteRes<Write2PassMAI, [HWXDL], 2>;
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
index 8be7308c8a6e00..e6c26090835a68 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
@@ -481,8 +481,7 @@ body: |
# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 1
+# GCN-NEXT: S_NOP 2
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma16x16_write_vgpr_flat_read
body: |
@@ -1219,8 +1218,7 @@ body: |
# GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 1
+# GCN-NEXT: S_NOP 2
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma16x16_write_agpr_flat_read
body: |
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
index 4d307a444b19c6..21b9afcace419e 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -749,7 +749,6 @@ body: |
# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 1
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma16x16_write_vgpr_flat_read
@@ -804,7 +803,7 @@ body: |
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_MOV_B32
name: dmfma16x16_write_vgpr_valu_read
body: |
@@ -868,7 +867,7 @@ body: |
# GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_DOT
name: dmfma16x16_write_vgpr_dot_read
body: |
@@ -988,7 +987,7 @@ body: |
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_MOV_B32
name: dmfma16x16_write_vgpr_valu_write
body: |
@@ -1484,7 +1483,6 @@ body: |
# GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 1
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma16x16_write_agpr_flat_read
@@ -1506,7 +1504,7 @@ body: |
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_ACCVGPR_READ_B32_e64
name: dmfma16x16_write_agpr_valu_read
body: |
@@ -1527,7 +1525,7 @@ body: |
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
name: dmfma16x16_write_agpr_valu_write
body: |
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s
new file mode 100644
index 00000000000000..6b4ddb3f000c2c
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s
@@ -0,0 +1,38 @@
+# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx90a --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
+# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
+# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
+
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - HWBranch
+# CHECK-NEXT: [1] - HWExport
+# CHECK-NEXT: [2] - HWLGKM
+# CHECK-NEXT: [3] - HWSALU
+# CHECK-NEXT: [4] - HWVALU
+# CHECK-NEXT: [5] - HWVMEM
+# CHECK-NEXT: [6] - HWXDL
+
+# CHECK: [0] [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
+# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
+# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
+v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
+v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
+
+
+v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
+
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s
index 323acbccfe5499..1239f4462cca6c 100644
--- a/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s
@@ -2,18 +2,9 @@
# CHECK: Iterations: 1
# CHECK: Instructions: 58
-# CHECK: Total Cycles: 543
+# CHECK: Total Cycles: 545
# CHECK: Total uOps: 58
-# CHECK: Resources:
-# CHECK: [0] - HWBranch
-# CHECK: [1] - HWExport
-# CHECK: [2] - HWLGKM
-# CHECK: [3] - HWSALU
-# CHECK: [4] - HWVALU
-# CHECK: [5] - HWVMEM
-# CHECK: [6] - HWXDL
-
v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
@@ -101,15 +92,39 @@ v_smfmac_i32_16x16x64_i8 a[10:13], v[2:3], a[4:7], v9
v_smfmac_i32_32x32x32_i8 v[10:25], a[2:3], v[4:7], v10 cbsz:3 abid:1
v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11
+# CHECK: Instruction Info:
+# CHECK-NEXT:[1]: #uOps
+# CHECK-NEXT:[2]: Latency
+# CHECK-NEXT:[3]: RThroughput
+# CHECK-NEXT:[4]: MayLoad
+# CHECK-NEXT:[5]: MayStore
+# CHECK-NEXT:[6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+
+# CHECK: 1 8 4.00 U v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
+# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
+
+# CHECK: Resources:
+# CHECK: [0] - HWBranch
+# CHECK: [1] - HWExport
+# CHECK: [2] - HWLGKM
+# CHECK: [3] - HWSALU
+# CHECK: [4] - HWVALU
+# CHECK: [5] - HWVMEM
+# CHECK: [6] - HWXDL
+
# CHECK: [0] [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33]
# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33]
-# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
-# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
-# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
+# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
+# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 v[0:3], v[4:5], v[6:7], v[0:3]
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[6:7], a[0:3]
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x8_f16 v[0:15], v[4:5], v[6:7], v[0:15]
More information about the llvm-commits
mailing list