[llvm] 0f3628a - AMDGPU: Correct cycle counts for f64 mfma on gfx940 (#83782)

via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 5 20:06:05 PST 2024


Author: Matt Arsenault
Date: 2024-03-06T09:36:01+05:30
New Revision: 0f3628a93749433df51b763ff675152d82a25973

URL: https://github.com/llvm/llvm-project/commit/0f3628a93749433df51b763ff675152d82a25973
DIFF: https://github.com/llvm/llvm-project/commit/0f3628a93749433df51b763ff675152d82a25973.diff

LOG: AMDGPU: Correct cycle counts for f64 mfma on gfx940 (#83782)

Added: 
    llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s

Modified: 
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
    llvm/lib/Target/AMDGPU/SISchedule.td
    llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 00fa93cc1923af..7bed0d8ef0d670 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2538,23 +2538,24 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
         break;
       case 8:
         NeedWaitStates =
-          ST.hasGFX940Insts()
-            ? isXDL(ST, *MFMA)
-              ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
-              : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
-            : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
+            isDGEMM(MFMA->getOpcode())
+                ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
+                                : DMFMA16x16WriteVgprVALUReadWaitStates
+            : ST.hasGFX940Insts()
+                ? isXDL(ST, *MFMA)
+                      ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
+                      : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
+                : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
         break;
       case 16: [[fallthrough]];
       default:
+        assert(!isDGEMM(MFMA->getOpcode()));
         NeedWaitStates =
-          isDGEMM(MFMA->getOpcode())
-            ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
-                            : DMFMA16x16WriteVgprVALUReadWaitStates
-            : ST.hasGFX940Insts()
-              ? isXDL(ST, *MFMA)
-                ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
-                : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
-              : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
+            ST.hasGFX940Insts()
+                ? isXDL(ST, *MFMA)
+                      ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
+                      : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
+                : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
         break;
       }
 
@@ -2633,21 +2634,24 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
               : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
         break;
       case 8:
-        NeedWaitStates = ST.hasGFX940Insts()
-          ? isXDL(ST, *MFMA)
-            ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
-            : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
-          : SMFMA16x16WriteVgprVALUWawWaitStates;
+        NeedWaitStates =
+            isDGEMM(MFMA->getOpcode()) ? DMFMA16x16WriteVgprVALUWriteWaitStates
+            :
+
+            ST.hasGFX940Insts()
+                ? isXDL(ST, *MFMA) ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
+                                   : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
+                : SMFMA16x16WriteVgprVALUWawWaitStates;
         break;
       case 16: [[fallthrough]];
       default:
-        NeedWaitStates = isDGEMM(MFMA->getOpcode())
-                   ? DMFMA16x16WriteVgprVALUWriteWaitStates
-                   : ST.hasGFX940Insts()
-                     ? isXDL(ST, *MFMA)
-                       ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
-                       : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
-                   : SMFMA32x32WriteVgprVALUWawWaitStates;
+        assert(!isDGEMM(MFMA->getOpcode()));
+        NeedWaitStates =
+            ST.hasGFX940Insts()
+                ? isXDL(ST, *MFMA)
+                      ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
+                      : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
+                : SMFMA32x32WriteVgprVALUWawWaitStates;
         break;
       }
 

diff  --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index b0e8e4112254d8..a60b1f28e9d34c 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -165,8 +165,10 @@ multiclass SICommonWriteRes {
   def : HWVALUWriteRes<WriteTrans32,       4>;
   def : HWVALUWriteRes<WriteQuarterRate32, 4>;
 
+  let ReleaseAtCycles = [4] in
   def : HWVALUWriteRes<Write4PassDGEMM,    4>;
-  def : HWVALUWriteRes<Write8PassDGEMM,   16>;
+  let ReleaseAtCycles = [8] in
+  def : HWVALUWriteRes<Write8PassDGEMM,    8>;
 
   let ReleaseAtCycles = [2] in
   def : HWWriteRes<Write2PassMAI,  [HWXDL], 2>;

diff  --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s
new file mode 100644
index 00000000000000..6b4ddb3f000c2c
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s
@@ -0,0 +1,38 @@
+# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx90a --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:     [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT: 1      8     4.00                  U     v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
+# CHECK-NEXT: 1      8     4.00                  U     v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
+# CHECK-NEXT: 1      12    8.00                  U     v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: 1      12    8.00                  U     v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
+
+
+# CHECK: Resources:
+# CHECK-NEXT: [0]   - HWBranch
+# CHECK-NEXT: [1]   - HWExport
+# CHECK-NEXT: [2]   - HWLGKM
+# CHECK-NEXT: [3]   - HWSALU
+# CHECK-NEXT: [4]   - HWVALU
+# CHECK-NEXT: [5]   - HWVMEM
+# CHECK-NEXT: [6]   - HWXDL
+
+# CHECK:     [0]    [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT: -      -      -      -     4.00    -      -     v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
+# CHECK-NEXT: -      -      -      -     4.00    -      -     v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
+# CHECK-NEXT: -      -      -      -     8.00    -      -     v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: -      -      -      -     8.00    -      -     v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
+v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
+v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
+
+
+v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
+

diff  --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s
index e7ddeee97ea924..0e1efbe90805b0 100644
--- a/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s
@@ -2,7 +2,7 @@
 
 # CHECK: Iterations:        1
 # CHECK: Instructions:      78
-# CHECK: Total Cycles:      699
+# CHECK: Total Cycles:      701
 # CHECK: Total uOps:        78
 
 v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
@@ -128,11 +128,10 @@ v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1
 # CHECK-NEXT:[6]: HasSideEffects (U)
 
 # CHECK:     [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-
-# CHECK:      1      8     1.00                  U     v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
-# CHECK-NEXT: 1      8     1.00                  U     v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-# CHECK-NEXT: 1      20    1.00                  U     v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
-# CHECK-NEXT: 1      20    1.00                  U     v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
+# CHECK:      1      8     4.00                  U     v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
+# CHECK-NEXT: 1      8     4.00                  U     v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+# CHECK-NEXT: 1      12    8.00                  U     v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: 1      12    8.00                  U     v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
 
 # CHECK: Resources:
 # CHECK: [0]   - HWBranch
@@ -148,10 +147,10 @@ v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1
 # CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
 # CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33]
 # CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
+# CHECK-NEXT: -      -      -      -     4.00    -      -     v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
+# CHECK-NEXT: -      -      -      -     4.00    -      -     v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+# CHECK-NEXT: -      -      -      -     8.00    -      -     v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: -      -      -      -     8.00    -      -     v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
 # CHECK-NEXT: -      -      -      -      -      -     4.00   v_mfma_f32_16x16x16_f16 v[0:3], v[4:5], v[6:7], v[0:3]
 # CHECK-NEXT: -      -      -      -      -      -     4.00   v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[6:7], a[0:3]
 # CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_32x32x8_f16 v[0:15], v[4:5], v[6:7], v[0:15]


        


More information about the llvm-commits mailing list