[llvm] c837f57 - AMDGPU: Handle gfx950 XDL-write-VGPR-Overlap-Src-AB wait state (#126732)

via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 11 07:30:22 PST 2025


Author: Vigneshwar Jayakumar
Date: 2025-02-11T22:30:16+07:00
New Revision: c837f572865eb2980b82a8415da45dc1157627bf

URL: https://github.com/llvm/llvm-project/commit/c837f572865eb2980b82a8415da45dc1157627bf
DIFF: https://github.com/llvm/llvm-project/commit/c837f572865eb2980b82a8415da45dc1157627bf.diff

LOG: AMDGPU: Handle gfx950 XDL-write-VGPR-Overlap-Src-AB wait state (#126732)

gfx950 requires one more wait state than gfx940 for this hazard when the XDL instruction takes 4, 8, or 16 passes; the 2-pass case is unchanged.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
    llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
    llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a21702af11a984..73b44680aad5d7 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2290,12 +2290,14 @@ GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
   return NumPasses + 2;
 }
 
-static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
-  // 2 pass -> 5
-  // 4 pass -> 7
-  // 8 pass -> 11
-  // 16 pass -> 19
-  return NumPasses + 3;
+static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
+                                                                bool IsGFX950) {
+  // xdl def cycles | gfx940 | gfx950
+  // 2 pass         |  5        5
+  // 4 pass         |  7        8
+  // 8 pass         |  11       12
+  // 16 pass        |  19       20
+  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
 }
 
 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
@@ -2464,7 +2466,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
           NeedWaitStates =
               isXDL(ST, *MI1)
                   ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
-                        NumPasses)
+                        NumPasses, ST.hasGFX950Insts())
                   : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
                         NumPasses);
           break;

diff  --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
index 52891989b88fbd..1eb7ec4c142f20 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -417,7 +417,8 @@ body:             |
 # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
 body:             |
@@ -439,7 +440,8 @@ body:             |
 # GCN-LABEL: name: smfmac32x32_write_agpr_mfma_srca_read_overlap
 # GCN:      V_SMFMAC
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            smfmac32x32_write_agpr_mfma_srca_read_overlap
 body:             |
@@ -450,7 +452,8 @@ body:             |
 # GCN-LABEL: name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap
 # GCN:      V_SMFMAC
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_SMFMAC
 name:            smfmac32x32_write_agpr_smfmac_srcc_read_overlap
 body:             |
@@ -462,7 +465,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
 body:             |
@@ -1715,7 +1719,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_agpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16X16X16_mfma_write_agpr_mfma_srca_read_overlap
 body:             |
@@ -1725,7 +1730,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16X16X32_mfma_write_agpr_mfma_srcb_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16X16X32_mfma_write_agpr_mfma_srcb_read_overlap
 body:             |
@@ -1735,7 +1741,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_dmfma16x16_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16X16X16_mfma_write_vgpr_dmfma16x16_srca_read_overlap
 body:             |
@@ -1826,7 +1833,8 @@ body:             |
 ...
 # GCN-LABEL: name: smfmac16x16x32_mfma_write_vgpr_smfmac_read_idx
 # GCN:      V_SMFMAC
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_SMFMAC
 name:            smfmac16x16x32_mfma_write_vgpr_smfmac_read_idx
 body:             |
@@ -2188,7 +2196,8 @@ body:             |
 # 4 pass source
 # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srca
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srca
 body:             |
@@ -2202,7 +2211,8 @@ body:             |
 # 4 pass source
 # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srcb
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srcb
 body:             |
@@ -2276,7 +2286,8 @@ body:             |
 # 4 pass source
 # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srca
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srca
 body:             |
@@ -2290,7 +2301,8 @@ body:             |
 # 4 pass source
 # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srcb
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srcb
 body:             |
@@ -2321,7 +2333,8 @@ body:             |
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
 body:             |
@@ -2336,7 +2349,8 @@ body:             |
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
 body:             |
@@ -2370,7 +2384,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
 body:             |
@@ -2386,7 +2401,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
 body:             |
@@ -2456,7 +2472,8 @@ body:             |
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca
 body:             |
@@ -2470,7 +2487,8 @@ body:             |
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb
 body:             |
@@ -2502,7 +2520,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srca
 body:             |
@@ -2519,7 +2538,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcb
 body:             |

diff  --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
index 433236180b1375..4585eca8fe894a 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
@@ -254,7 +254,7 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
     ; GCN-NEXT: S_NOP 7
-    ; GCN-NEXT: S_NOP 2
+    ; GCN-NEXT: S_NOP 3
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
     ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
@@ -275,7 +275,7 @@ body:             |
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_NOP 6
+    ; GCN-NEXT: S_NOP 7
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
     ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec


        


More information about the llvm-commits mailing list