[llvm] c837f57 - AMDGPU: Handle gfx950 XDL-write-VGPR-Overlap-Src-AB wait state (#126732)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 11 07:30:22 PST 2025
Author: Vigneshwar Jayakumar
Date: 2025-02-11T22:30:16+07:00
New Revision: c837f572865eb2980b82a8415da45dc1157627bf
URL: https://github.com/llvm/llvm-project/commit/c837f572865eb2980b82a8415da45dc1157627bf
DIFF: https://github.com/llvm/llvm-project/commit/c837f572865eb2980b82a8415da45dc1157627bf.diff
LOG: AMDGPU: Handle gfx950 XDL-write-VGPR-Overlap-Src-AB wait state (#126732)
gfx950 needs additional wait states compared to gfx940.
Added:
Modified:
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a21702af11a984..73b44680aad5d7 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2290,12 +2290,14 @@ GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
return NumPasses + 2;
}
-static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
- // 2 pass -> 5
- // 4 pass -> 7
- // 8 pass -> 11
- // 16 pass -> 19
- return NumPasses + 3;
+static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
+ bool IsGFX950) {
+ // xdl def cycles | gfx940 | gfx950
+ // 2 pass | 5 5
+ // 4 pass | 7 8
+ // 8 pass | 11 12
+ // 16 pass | 19 20
+ return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
@@ -2464,7 +2466,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
NeedWaitStates =
isXDL(ST, *MI1)
? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
- NumPasses)
+ NumPasses, ST.hasGFX950Insts())
: GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
NumPasses);
break;
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
index 52891989b88fbd..1eb7ec4c142f20 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -417,7 +417,8 @@ body: |
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
body: |
@@ -439,7 +440,8 @@ body: |
# GCN-LABEL: name: smfmac32x32_write_agpr_mfma_srca_read_overlap
# GCN: V_SMFMAC
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
name: smfmac32x32_write_agpr_mfma_srca_read_overlap
body: |
@@ -450,7 +452,8 @@ body: |
# GCN-LABEL: name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap
# GCN: V_SMFMAC
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_SMFMAC
name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap
body: |
@@ -462,7 +465,8 @@ body: |
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
name: xdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
body: |
@@ -1715,7 +1719,8 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_agpr_mfma_srca_read_overlap
# GCN: V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
# GCN-NEXT: V_MFMA
name: xdl_sgemm16X16X16_mfma_write_agpr_mfma_srca_read_overlap
body: |
@@ -1725,7 +1730,8 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm16X16X32_mfma_write_agpr_mfma_srcb_read_overlap
# GCN: V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
# GCN-NEXT: V_MFMA
name: xdl_sgemm16X16X32_mfma_write_agpr_mfma_srcb_read_overlap
body: |
@@ -1735,7 +1741,8 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_dmfma16x16_srca_read_overlap
# GCN: V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
# GCN-NEXT: V_MFMA
name: xdl_sgemm16X16X16_mfma_write_vgpr_dmfma16x16_srca_read_overlap
body: |
@@ -1826,7 +1833,8 @@ body: |
...
# GCN-LABEL: name: smfmac16x16x32_mfma_write_vgpr_smfmac_read_idx
# GCN: V_SMFMAC
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
# GCN-NEXT: V_SMFMAC
name: smfmac16x16x32_mfma_write_vgpr_smfmac_read_idx
body: |
@@ -2188,7 +2196,8 @@ body: |
# 4 pass source
# GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srca
# GCN: V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
# GCN-NEXT: V_MFMA
name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srca
body: |
@@ -2202,7 +2211,8 @@ body: |
# 4 pass source
# GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srcb
# GCN: V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
# GCN-NEXT: V_MFMA
name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srcb
body: |
@@ -2276,7 +2286,8 @@ body: |
# 4 pass source
# GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srca
# GCN: V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
# GCN-NEXT: V_MFMA
name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srca
body: |
@@ -2290,7 +2301,8 @@ body: |
# 4 pass source
# GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srcb
# GCN: V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
# GCN-NEXT: V_MFMA
name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srcb
body: |
@@ -2321,7 +2333,8 @@ body: |
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
body: |
@@ -2336,7 +2349,8 @@ body: |
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
body: |
@@ -2370,7 +2384,8 @@ body: |
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
body: |
@@ -2386,7 +2401,8 @@ body: |
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
body: |
@@ -2456,7 +2472,8 @@ body: |
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca
body: |
@@ -2470,7 +2487,8 @@ body: |
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb
body: |
@@ -2502,7 +2520,8 @@ body: |
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srca
body: |
@@ -2519,7 +2538,8 @@ body: |
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcb
body: |
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
index 433236180b1375..4585eca8fe894a 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
@@ -254,7 +254,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
- ; GCN-NEXT: S_NOP 2
+ ; GCN-NEXT: S_NOP 3
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
@@ -275,7 +275,7 @@ body: |
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
- ; GCN-NEXT: S_NOP 6
+ ; GCN-NEXT: S_NOP 7
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
More information about the llvm-commits
mailing list