[llvm] AMDGPU: Correct cycle counts for f64 mfma on gfx940 (PR #83782)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 4 02:54:09 PST 2024


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/83782

>From f68b40423728081987d8d9549b98f30226d63289 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 4 Mar 2024 13:10:39 +0530
Subject: [PATCH] AMDGPU: Correct cycle counts for f64 mfma on gfx940/gfx90a

The manual states these are 4-pass and 8-pass instructions. I'm not
sure whether reporting these as using the VALU resource rather than
the XDL resource is correct. The Latency and ReleaseAtCycles values
were also mismatched, and I'm not sure whether that was intentional.
---
 llvm/lib/Target/AMDGPU/SISchedule.td          |  4 +-
 .../GlobalISel/llvm.amdgcn.mfma.gfx90a.ll     | 12 ++----
 .../CodeGen/AMDGPU/mai-hazards-gfx90a.mir     |  6 +--
 .../CodeGen/AMDGPU/mai-hazards-gfx940.mir     | 12 +++---
 llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s | 38 ++++++++++++++++
 llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s | 43 +++++++++++++------
 6 files changed, 81 insertions(+), 34 deletions(-)
 create mode 100644 llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s

diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index b0e8e4112254d8..a60b1f28e9d34c 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -165,8 +165,10 @@ multiclass SICommonWriteRes {
   def : HWVALUWriteRes<WriteTrans32,       4>;
   def : HWVALUWriteRes<WriteQuarterRate32, 4>;
 
+  let ReleaseAtCycles = [4] in
   def : HWVALUWriteRes<Write4PassDGEMM,    4>;
-  def : HWVALUWriteRes<Write8PassDGEMM,   16>;
+  let ReleaseAtCycles = [8] in
+  def : HWVALUWriteRes<Write8PassDGEMM,    8>;
 
   let ReleaseAtCycles = [2] in
   def : HWWriteRes<Write2PassMAI,  [HWXDL], 2>;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index e7faabb72ab691..3cb4ba28ebb1e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -277,8 +277,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
 ; GCN-NEXT:    s_endpgm
@@ -302,8 +301,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %
 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[4:5]
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[4:5] offset:16
 ; GCN-NEXT:    s_endpgm
@@ -338,8 +336,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
 ; GCN-NEXT:    s_endpgm
@@ -374,8 +371,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[12:13]
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[12:13] offset:16
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
index 8be7308c8a6e00..e6c26090835a68 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
@@ -481,8 +481,7 @@ body:             |
 # GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 1
+# GCN-NEXT: S_NOP 2
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma16x16_write_vgpr_flat_read
 body:             |
@@ -1219,8 +1218,7 @@ body:             |
 # GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 1
+# GCN-NEXT: S_NOP 2
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma16x16_write_agpr_flat_read
 body:             |
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
index 4d307a444b19c6..21b9afcace419e 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -749,7 +749,6 @@ body:             |
 # GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 1
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma16x16_write_vgpr_flat_read
@@ -804,7 +803,7 @@ body:             |
 # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 1
 # GCN-NEXT: V_MOV_B32
 name:            dmfma16x16_write_vgpr_valu_read
 body:             |
@@ -868,7 +867,7 @@ body:             |
 # GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 1
 # GCN-NEXT: V_DOT
 name:            dmfma16x16_write_vgpr_dot_read
 body:             |
@@ -988,7 +987,7 @@ body:             |
 # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 1
 # GCN-NEXT: V_MOV_B32
 name:            dmfma16x16_write_vgpr_valu_write
 body:             |
@@ -1484,7 +1483,6 @@ body:             |
 # GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 1
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            dmfma16x16_write_agpr_flat_read
@@ -1506,7 +1504,7 @@ body:             |
 # GCN-LABEL: name: dmfma16x16_write_agpr_valu_read
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 1
 # GCN-NEXT: V_ACCVGPR_READ_B32_e64
 name:            dmfma16x16_write_agpr_valu_read
 body:             |
@@ -1527,7 +1525,7 @@ body:             |
 # GCN-LABEL: name: dmfma16x16_write_agpr_valu_write
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GCN-NEXT: S_NOP 1
 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
 name:            dmfma16x16_write_agpr_valu_write
 body:             |
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s
new file mode 100644
index 00000000000000..6b4ddb3f000c2c
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s
@@ -0,0 +1,38 @@
+# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx90a --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:     [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT: 1      8     4.00                  U     v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
+# CHECK-NEXT: 1      8     4.00                  U     v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
+# CHECK-NEXT: 1      12    8.00                  U     v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: 1      12    8.00                  U     v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
+
+
+# CHECK: Resources:
+# CHECK-NEXT: [0]   - HWBranch
+# CHECK-NEXT: [1]   - HWExport
+# CHECK-NEXT: [2]   - HWLGKM
+# CHECK-NEXT: [3]   - HWSALU
+# CHECK-NEXT: [4]   - HWVALU
+# CHECK-NEXT: [5]   - HWVMEM
+# CHECK-NEXT: [6]   - HWXDL
+
+# CHECK:     [0]    [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT: -      -      -      -     4.00    -      -     v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
+# CHECK-NEXT: -      -      -      -     4.00    -      -     v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
+# CHECK-NEXT: -      -      -      -     8.00    -      -     v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: -      -      -      -     8.00    -      -     v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
+v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
+v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
+
+
+v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
+
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s
index 323acbccfe5499..1239f4462cca6c 100644
--- a/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s
@@ -2,18 +2,9 @@
 
 # CHECK: Iterations:        1
 # CHECK: Instructions:      58
-# CHECK: Total Cycles:      543
+# CHECK: Total Cycles:      545
 # CHECK: Total uOps:        58
 
-# CHECK: Resources:
-# CHECK: [0]   - HWBranch
-# CHECK: [1]   - HWExport
-# CHECK: [2]   - HWLGKM
-# CHECK: [3]   - HWSALU
-# CHECK: [4]   - HWVALU
-# CHECK: [5]   - HWVMEM
-# CHECK: [6]   - HWXDL
-
 v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
 v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
 
@@ -101,15 +92,39 @@ v_smfmac_i32_16x16x64_i8 a[10:13], v[2:3], a[4:7], v9
 v_smfmac_i32_32x32x32_i8 v[10:25], a[2:3], v[4:7], v10 cbsz:3 abid:1
 v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11
 
+# CHECK: Instruction Info:
+# CHECK-NEXT:[1]: #uOps
+# CHECK-NEXT:[2]: Latency
+# CHECK-NEXT:[3]: RThroughput
+# CHECK-NEXT:[4]: MayLoad
+# CHECK-NEXT:[5]: MayStore
+# CHECK-NEXT:[6]: HasSideEffects (U)
+
+# CHECK:     [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+
+# CHECK:      1      8     4.00                  U     v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
+# CHECK-NEXT: 1      8     4.00                  U     v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+# CHECK-NEXT: 1      12    8.00                  U     v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: 1      12    8.00                  U     v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
+
+# CHECK: Resources:
+# CHECK: [0]   - HWBranch
+# CHECK: [1]   - HWExport
+# CHECK: [2]   - HWLGKM
+# CHECK: [3]   - HWSALU
+# CHECK: [4]   - HWVALU
+# CHECK: [5]   - HWVMEM
+# CHECK: [6]   - HWXDL
+
 # CHECK:     [0]    [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
 # CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
 # CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33]
 # CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
+# CHECK-NEXT: -      -      -      -     4.00    -      -     v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
+# CHECK-NEXT: -      -      -      -     4.00    -      -     v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+# CHECK-NEXT: -      -      -      -     8.00    -      -     v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: -      -      -      -     8.00    -      -     v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
 # CHECK-NEXT: -      -      -      -      -      -     4.00   v_mfma_f32_16x16x16_f16 v[0:3], v[4:5], v[6:7], v[0:3]
 # CHECK-NEXT: -      -      -      -      -      -     4.00   v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[6:7], a[0:3]
 # CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_32x32x8_f16 v[0:15], v[4:5], v[6:7], v[0:15]



More information about the llvm-commits mailing list