[llvm] f311f93 - [AMDGPU] gfx940 VALU hazard recognizer

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 29 11:11:47 PDT 2022


Author: Stanislav Mekhanoshin
Date: 2022-03-29T10:57:54-07:00
New Revision: f311f934e13e7c66c312c3cf1f042c01d8038c49

URL: https://github.com/llvm/llvm-project/commit/f311f934e13e7c66c312c3cf1f042c01d8038c49
DIFF: https://github.com/llvm/llvm-project/commit/f311f934e13e7c66c312c3cf1f042c01d8038c49.diff

LOG: [AMDGPU] gfx940 VALU hazard recognizer

Differential Revision: https://reviews.llvm.org/D122339

Added: 
    llvm/test/CodeGen/AMDGPU/gfx940-hazards.mir

Modified: 
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
    llvm/lib/Target/AMDGPU/GCNSubtarget.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 0eeb9111665cc..f61d71b02b7a6 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -813,13 +813,136 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
 }
 
 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
+  int WaitStatesNeeded = 0;
+
+  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
+    const int TransDefWaitstates = 1;
+
+    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
+      if (!SIInstrInfo::isTRANS(MI))
+        return false;
+      const SIRegisterInfo *TRI = ST.getRegisterInfo();
+      const SIInstrInfo *TII = ST.getInstrInfo();
+      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
+
+      for (const MachineOperand &Use : VALU->explicit_uses()) {
+        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
+          return true;
+      }
+
+      return false;
+    };
+
+    int WaitStatesNeededForDef =
+        TransDefWaitstates -
+        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+  }
+
+  if (ST.hasDstSelForwardingHazard()) {
+    const int Shift16DefWaitstates = 1;
+
+    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
+      if (!SIInstrInfo::isVALU(MI))
+        return false;
+      const SIInstrInfo *TII = ST.getInstrInfo();
+      if (SIInstrInfo::isSDWA(MI)) {
+        if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
+          if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
+            return false;
+      } else {
+        if ((AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                        AMDGPU::OpName::op_sel) == -1) ||
+            !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
+                  ->getImm() &
+              SISrcMods::DST_OP_SEL))
+          return false;
+      }
+      const SIRegisterInfo *TRI = ST.getRegisterInfo();
+      if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
+        Register Def = Dst->getReg();
+
+        for (const MachineOperand &Use : VALU->explicit_uses()) {
+          if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
+            return true;
+        }
+      }
+
+      return false;
+    };
+
+    int WaitStatesNeededForDef =
+        Shift16DefWaitstates -
+        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+  }
+
+  if (ST.hasVDecCoExecHazard()) {
+    const int VALUWriteSGPRVALUReadWaitstates = 2;
+    const int VALUWriteEXECRWLane = 4;
+    const int VALUWriteVGPRReadlaneRead = 1;
+
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const MachineRegisterInfo &MRI = MF.getRegInfo();
+    Register UseReg;
+    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
+      if (!SIInstrInfo::isVALU(MI))
+        return false;
+      return MI.modifiesRegister(UseReg, TRI);
+    };
+
+    for (const MachineOperand &Use : VALU->explicit_uses()) {
+      if (!Use.isReg())
+        continue;
+
+      UseReg = Use.getReg();
+      if (TRI->isSGPRReg(MRI, UseReg)) {
+        int WaitStatesNeededForDef =
+            VALUWriteSGPRVALUReadWaitstates -
+            getWaitStatesSince(IsVALUDefSGPRFn,
+                               VALUWriteSGPRVALUReadWaitstates);
+        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+      }
+    }
+
+    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
+      UseReg = AMDGPU::VCC;
+      int WaitStatesNeededForDef =
+          VALUWriteSGPRVALUReadWaitstates -
+          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
+      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+    }
+
+    switch (VALU->getOpcode()) {
+    case AMDGPU::V_READLANE_B32:
+    case AMDGPU::V_READFIRSTLANE_B32: {
+      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
+      UseReg = Src->getReg();
+      int WaitStatesNeededForDef =
+          VALUWriteVGPRReadlaneRead -
+          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
+      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+    }
+      LLVM_FALLTHROUGH;
+    case AMDGPU::V_WRITELANE_B32: {
+      UseReg = AMDGPU::EXEC;
+      int WaitStatesNeededForDef =
+          VALUWriteEXECRWLane -
+          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
+      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+      break;
+    }
+    default:
+      break;
+    }
+  }
+
   // This checks for the hazard where VMEM instructions that store more than
   // 8 bytes can have there store data over written by the next instruction.
   if (!ST.has12DWordStoreHazard())
-    return 0;
+    return WaitStatesNeeded;
 
   const MachineRegisterInfo &MRI = MF.getRegInfo();
-  int WaitStatesNeeded = 0;
 
   for (const MachineOperand &Def : VALU->defs()) {
     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));

diff  --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index d48327a4aaac3..6f86bbab45a3c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -966,8 +966,19 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return HasLdsBranchVmemWARHazard;
   }
 
+  // Has one cycle hazard on transcendental instruction feeding a
+  // non transcendental VALU.
+  bool hasTransForwardingHazard() const { return GFX940Insts; }
+
+  // Has one cycle hazard on a VALU instruction partially writing dst with
+  // a shift of result bits feeding another VALU instruction.
+  bool hasDstSelForwardingHazard() const { return GFX940Insts; }
+
   // Cannot use op_sel with v_dot instructions.
-  bool hasDOTOpSelHazard() const {
+  bool hasDOTOpSelHazard() const { return GFX940Insts; }
+
+  // Does not have HW interlocks for VALU writing and then reading SGPRs.
+  bool hasVDecCoExecHazard() const {
     return GFX940Insts;
   }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx940-hazards.mir b/llvm/test/CodeGen/AMDGPU/gfx940-hazards.mir
new file mode 100644
index 0000000000000..7f84babc72c21
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx940-hazards.mir
@@ -0,0 +1,217 @@
+# RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: trans32_write_non_trans32_read
+# GCN:      V_RCP_F32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MUL_F32
+name:            trans32_write_non_trans32_read
+body:             |
+  bb.0:
+    $vgpr1 = V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr2 = V_MUL_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: trans32_write_trans_read
+# GCN:      V_SIN_F32
+# GCN-NEXT: V_COS_F32
+name:            trans32_write_trans_read
+body:             |
+  bb.0:
+    $vgpr0 = V_SIN_F32_e32 $vgpr1, implicit $mode, implicit $exec
+    $vgpr2 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: trans64_write_non_trans_read
+# GCN:      V_RCP_F64
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_OR_B32
+name:            trans64_write_non_trans_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1 = V_RCP_F64_e32 $vgpr2_vgpr3, implicit $mode, implicit $exec
+    $vgpr4 = V_OR_B32_e32 $vgpr1, $vgpr5, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: trans32_write_non_trans64_read
+# GCN:      V_EXP_F32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MUL_F64
+name:            trans32_write_non_trans64_read
+body:             |
+  bb.0:
+    $vgpr1 = V_EXP_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MUL_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: opsel_hi16_write_valu_read
+# GCN:      V_ADD_I16
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MUL_F64
+name:            opsel_hi16_write_valu_read
+body:             |
+  bb.0:
+    $vgpr0 = V_ADD_I16_e64 8, $vgpr1, 0, $vgpr2, 0, 0, implicit $exec
+    $vgpr4_vgpr5 = V_MUL_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: opsel_lo16_write_valu_read
+# GCN:      V_ADD_I16
+# GCN-NEXT: V_MUL_F64
+name:            opsel_lo16_write_valu_read
+body:             |
+  bb.0:
+    $vgpr0 = V_ADD_I16_e64 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $exec
+    $vgpr4_vgpr5 = V_MUL_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: sdwa_hi16_write_valu_read
+# GCN:      V_MOV_B32_sdwa
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MOV_B32_e32
+name:            sdwa_hi16_write_valu_read
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_sdwa 0, $vgpr1, 0, 5, 2, 5, implicit $vgpr0(tied-def 0), implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: sdwa_lo16_write_valu_read
+# GCN:      V_MOV_B32_sdwa
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MOV_B32_e32
+name:            sdwa_lo16_write_valu_read
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_sdwa 0, $vgpr1, 0, 4, 2, 5, implicit $vgpr0(tied-def 0), implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: sdwa_dword_write_valu_read
+# GCN:      V_MOV_B32_sdwa
+# GCN-NEXT: V_MOV_B32_e32
+name:            sdwa_dword_write_valu_read
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_sdwa 0, $vgpr1, 0, 6, 2, 5, implicit $vgpr0(tied-def 0), implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: sdwa_lo16_no_write_valu_read
+# GCN:      V_CMP_EQ_U32_sdwa
+# GCN-NEXT: V_MOV_B32_e32
+name:            sdwa_lo16_no_write_valu_read
+body:             |
+  bb.0:
+    $vcc = V_CMP_EQ_U32_sdwa 0, $vgpr1, 0, $vgpr0, 0, 5, 2, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: valu_write_sgpr_valu_read_as_constant
+# GCN:      V_READFIRSTLANE_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MOV_B32_e32
+name:            valu_write_sgpr_valu_read_as_constant
+body:             |
+  bb.0:
+    $sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: valu_write_vcc_valu_read_as_constant
+# GCN:      V_CMP_NE_U32_e32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_ADDC_U32_e32
+name:            valu_write_vcc_valu_read_as_constant
+body:             |
+  bb.0:
+    V_CMP_NE_U32_e32 0, $vgpr0, implicit-def $vcc, implicit $exec
+    $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+...
+
+# GCN-LABEL: name: valu_write_sgpr_readlane_read_as_laneselect
+# GCN:      V_READFIRSTLANE_B32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_READLANE_B32
+name:            valu_write_sgpr_readlane_read_as_laneselect
+body:             |
+  bb.0:
+    $sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+    $sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: valu_write_sgpr_writelane_read_as_laneselect
+# GCN:      V_ADD_CO_U32_e64
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_WRITELANE_B32
+name:            valu_write_sgpr_writelane_read_as_laneselect
+body:             |
+  bb.0:
+    $vgpr0, $sgpr0_sgpr1 = V_ADD_CO_U32_e64 $vgpr0, 1, 0, implicit $exec
+    $vgpr1 = V_WRITELANE_B32 0, $sgpr0, $vgpr1, implicit $exec
+...
+
+# GCN-LABEL: name: vcmpx_write_exec_valu_read_as_constant
+# GCN:      V_CMPX_EQ_I32_e32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MOV_B32_e32
+name:            vcmpx_write_exec_valu_read_as_constant
+body:             |
+  bb.0:
+    implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $exec_lo, implicit $exec
+...
+
+# GCN-LABEL: name: vcmpx_write_exec_readlane
+# GCN:      V_CMPX_EQ_I32_e32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_READLANE_B32
+name:            vcmpx_write_exec_readlane
+body:             |
+  bb.0:
+    implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+    $sgpr1 = V_READLANE_B32 $vgpr1, 0, implicit $exec
+...
+
+# GCN-LABEL: name: vcmpx_write_exec_readfirstlane
+# GCN:      V_CMPX_EQ_I32_e32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_READFIRSTLANE_B32
+name:            vcmpx_write_exec_readfirstlane
+body:             |
+  bb.0:
+    implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+    $sgpr1 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+...
+
+# GCN-LABEL: name: vcmpx_write_exec_writelane
+# GCN:      V_CMPX_EQ_I32_e32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_WRITELANE_B32
+name:            vcmpx_write_exec_writelane
+body:             |
+  bb.0:
+    implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+    $vgpr1 = V_WRITELANE_B32 0, $sgpr0, $vgpr1, implicit $exec
+...
+
+# GCN-LABEL: name: valu_write_vgpr_readlane_read
+# GCN:      V_ADD_CO_U32_e32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_READLANE_B32
+name:            valu_write_vgpr_readlane_read
+body:             |
+  bb.0:
+    $vgpr1 = V_ADD_CO_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $exec
+    $sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: valu_write_vgpr_readfirstlane_read
+# GCN:      V_ADD_CO_U32_e32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_READFIRSTLANE_B32
+name:            valu_write_vgpr_readfirstlane_read
+body:             |
+  bb.0:
+    $vgpr1 = V_ADD_CO_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $exec
+    $sgpr1 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+...


        


More information about the llvm-commits mailing list