[llvm] 7fc52d7 - [AMDGPU] Fix DGEMM hazard for GFX90a

Vang Thao via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 1 11:59:26 PDT 2022


Author: Vang Thao
Date: 2022-08-01T11:56:22-07:00
New Revision: 7fc52d7c8b114ea57003a30b9acca92a9797d274

URL: https://github.com/llvm/llvm-project/commit/7fc52d7c8b114ea57003a30b9acca92a9797d274
DIFF: https://github.com/llvm/llvm-project/commit/7fc52d7c8b114ea57003a30b9acca92a9797d274.diff

LOG: [AMDGPU] Fix DGEMM hazard for GFX90a

Between a VALU write and a memory (VM, L/DS, FLAT) instruction, SQ would insert
wait-states to avoid a data hazard. However, when there is a DGEMM instruction
in between them, SQ incorrectly disables the wait-states, so the data hazard
needs to be handled with this workaround.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D130677

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
    llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
    llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 481ecafd20ee..44542b352b58 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2268,12 +2268,14 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
   if (SIInstrInfo::isMFMA(*MI))
     return 0;
 
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
   int WaitStatesNeeded = 0;
 
-  bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
-                       SIInstrInfo::isFLAT(*MI) ||
-                       SIInstrInfo::isDS(*MI) ||
-                       SIInstrInfo::isEXP(*MI);
+  bool IsMem = SIInstrInfo::isVMEM(*MI) ||
+               SIInstrInfo::isFLAT(*MI) ||
+               SIInstrInfo::isDS(*MI);
+  bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
   bool IsVALU = SIInstrInfo::isVALU(*MI);
 
   const MachineInstr *MFMA = nullptr;
@@ -2295,6 +2297,20 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
     return true;
   };
 
+  bool DGEMMAfterVALUWrite = false;
+  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
+    // Found DGEMM on reverse traversal to def.
+    if (isDGEMM(MI.getOpcode()))
+      DGEMMAfterVALUWrite = true;
+
+    // Only hazard if register is defined by a VALU and a DGEMM is found after
+    // the def.
+    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
+      return false;
+
+    return true;
+  };
+
   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::src2);
 
@@ -2316,6 +2332,7 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
     const int DotWriteSameDotReadSrcAB = 3;
     const int DotWriteDifferentVALURead = 3;
+    const int DMFMABetweenVALUWriteVMEMRead = 2;
     const int MaxWaitStates = 19;
 
     for (const MachineOperand &Use : MI->explicit_uses()) {
@@ -2339,6 +2356,22 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
       }
 
+      // Workaround for HW data hazard bug observed only in GFX90A. When there
+      // is a DGEMM instruction in-between a VALU and a VMEM instruction, the
+      // SQ incorrectly omits the two wait states needed between the two
+      // instructions to avoid a data hazard.
+      if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
+        DGEMMAfterVALUWrite = false;
+        if (TRI.isVectorRegister(MRI, Reg)) {
+          int WaitStatesNeededForUse =
+                DMFMABetweenVALUWriteVMEMRead -
+                getWaitStatesSinceDef(Reg, IsDGEMMHazard,
+                                      DMFMABetweenVALUWriteVMEMRead);
+
+          WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+        }
+      }
+
       MFMA = nullptr;
       WaitStatesSinceDef =
           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);

diff  --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
index 73749edefb2e..e33094ae0bac 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
@@ -1308,3 +1308,178 @@ body:             |
     $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
     $vgpr4_vgpr5 = V_FMAC_F64_e32 $vgpr4_vgpr5, $vgpr4_vgpr5, $vgpr4_vgpr5, implicit $mode, implicit $exec
 ...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            dgemm_between_valu_write_buffer_store
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_load
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: BUFFER_LOAD_DWORD
+name:            dgemm_between_valu_write_buffer_load
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_global_store
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: GLOBAL_STORE_DWORD
+
+name:            dgemm_between_valu_write_global_store
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_global_load
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: GLOBAL_LOAD_DWORD
+name:            dgemm_between_valu_write_global_load
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_ds_write
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: DS_WRITE_B32
+name:            dgemm_between_valu_write_ds_write
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B32 $vgpr1, $vgpr0, 0, 0, implicit $m0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_ds_read
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: DS_READ_B32_gfx9
+name:            dgemm_between_valu_write_ds_read
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_flat_store
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dgemm_between_valu_write_flat_store
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $agpr2, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dgemm_between_valu_write_flat_load
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_LOAD_DWORD
+name:            dgemm_between_valu_write_flat_load
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dgemm_between_valu_write_scratch_store
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: SCRATCH_STORE_DWORD
+name:            dgemm_between_valu_write_scratch_store
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    SCRATCH_STORE_DWORD $vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dgemm_between_valu_write_scratch_load
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: SCRATCH_LOAD_DWORD
+name:            dgemm_between_valu_write_scratch_load
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = SCRATCH_LOAD_DWORD undef $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_fallthrough1
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN:      bb.1:
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            dgemm_between_valu_write_buffer_store_fallthrough1
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+
+  bb.1:
+    BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_fallthrough2
+# GCN:      V_MOV_B32_e32
+# GCN:      bb.1:
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            dgemm_between_valu_write_buffer_store_fallthrough2
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+
+  bb.1:
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_fallthrough3
+# GCN:      V_MOV_B32_e32
+# GCN:      bb.1:
+# GCN:      bb.2:
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            dgemm_between_valu_write_buffer_store_fallthrough3
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+
+  bb.1:
+
+  bb.2:
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
index 91dcfd7a6b3e..bdb947ffadc8 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -2016,3 +2016,15 @@ body:             |
     $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_MFMA_F32_32X32X1F32_vgprcd_e64 $agpr26, $agpr28, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
     $vgpr1 = V_MOV_B32_e32 0, implicit $exec
 ...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_no_snop
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA_F64
+# GCN-NOT:  S_NOP
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            dgemm_between_valu_write_buffer_store_no_snop
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...


        


More information about the llvm-commits mailing list