[llvm] f311f93 - [AMDGPU] gfx940 VALU hazard recognizer
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 29 11:11:47 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-03-29T10:57:54-07:00
New Revision: f311f934e13e7c66c312c3cf1f042c01d8038c49
URL: https://github.com/llvm/llvm-project/commit/f311f934e13e7c66c312c3cf1f042c01d8038c49
DIFF: https://github.com/llvm/llvm-project/commit/f311f934e13e7c66c312c3cf1f042c01d8038c49.diff
LOG: [AMDGPU] gfx940 VALU hazard recognizer
Differential Revision: https://reviews.llvm.org/D122339
Added:
llvm/test/CodeGen/AMDGPU/gfx940-hazards.mir
Modified:
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 0eeb9111665cc..f61d71b02b7a6 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -813,13 +813,136 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
}
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
+ int WaitStatesNeeded = 0;
+
+ if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
+ const int TransDefWaitstates = 1;
+
+ auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
+ if (!SIInstrInfo::isTRANS(MI))
+ return false;
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
+
+ for (const MachineOperand &Use : VALU->explicit_uses()) {
+ if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
+ return true;
+ }
+
+ return false;
+ };
+
+ int WaitStatesNeededForDef =
+ TransDefWaitstates -
+ getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ }
+
+ if (ST.hasDstSelForwardingHazard()) {
+ const int Shift16DefWaitstates = 1;
+
+ auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
+ if (!SIInstrInfo::isVALU(MI))
+ return false;
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ if (SIInstrInfo::isSDWA(MI)) {
+ if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
+ if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
+ return false;
+ } else {
+ if ((AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::op_sel) == -1) ||
+ !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
+ ->getImm() &
+ SISrcMods::DST_OP_SEL))
+ return false;
+ }
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
+ Register Def = Dst->getReg();
+
+ for (const MachineOperand &Use : VALU->explicit_uses()) {
+ if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
+ return true;
+ }
+ }
+
+ return false;
+ };
+
+ int WaitStatesNeededForDef =
+ Shift16DefWaitstates -
+ getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ }
+
+ if (ST.hasVDecCoExecHazard()) {
+ const int VALUWriteSGPRVALUReadWaitstates = 2;
+ const int VALUWriteEXECRWLane = 4;
+ const int VALUWriteVGPRReadlaneRead = 1;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register UseReg;
+ auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
+ if (!SIInstrInfo::isVALU(MI))
+ return false;
+ return MI.modifiesRegister(UseReg, TRI);
+ };
+
+ for (const MachineOperand &Use : VALU->explicit_uses()) {
+ if (!Use.isReg())
+ continue;
+
+ UseReg = Use.getReg();
+ if (TRI->isSGPRReg(MRI, UseReg)) {
+ int WaitStatesNeededForDef =
+ VALUWriteSGPRVALUReadWaitstates -
+ getWaitStatesSince(IsVALUDefSGPRFn,
+ VALUWriteSGPRVALUReadWaitstates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ }
+ }
+
+ if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
+ UseReg = AMDGPU::VCC;
+ int WaitStatesNeededForDef =
+ VALUWriteSGPRVALUReadWaitstates -
+ getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ }
+
+ switch (VALU->getOpcode()) {
+ case AMDGPU::V_READLANE_B32:
+ case AMDGPU::V_READFIRSTLANE_B32: {
+ MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
+ UseReg = Src->getReg();
+ int WaitStatesNeededForDef =
+ VALUWriteVGPRReadlaneRead -
+ getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ }
+ LLVM_FALLTHROUGH;
+ case AMDGPU::V_WRITELANE_B32: {
+ UseReg = AMDGPU::EXEC;
+ int WaitStatesNeededForDef =
+ VALUWriteEXECRWLane -
+ getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
// This checks for the hazard where VMEM instructions that store more than
// 8 bytes can have there store data over written by the next instruction.
if (!ST.has12DWordStoreHazard())
- return 0;
+ return WaitStatesNeeded;
const MachineRegisterInfo &MRI = MF.getRegInfo();
- int WaitStatesNeeded = 0;
for (const MachineOperand &Def : VALU->defs()) {
WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index d48327a4aaac3..6f86bbab45a3c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -966,8 +966,19 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasLdsBranchVmemWARHazard;
}
+ // Has one cycle hazard on transcendental instruction feeding a
+ // non transcendental VALU.
+ bool hasTransForwardingHazard() const { return GFX940Insts; }
+
+ // Has one cycle hazard on a VALU instruction partially writing dst with
+ // a shift of result bits feeding another VALU instruction.
+ bool hasDstSelForwardingHazard() const { return GFX940Insts; }
+
// Cannot use op_sel with v_dot instructions.
- bool hasDOTOpSelHazard() const {
+ bool hasDOTOpSelHazard() const { return GFX940Insts; }
+
+ // Does not have HW interlocks for VALU writing and then reading SGPRs.
+ bool hasVDecCoExecHazard() const {
return GFX940Insts;
}
diff --git a/llvm/test/CodeGen/AMDGPU/gfx940-hazards.mir b/llvm/test/CodeGen/AMDGPU/gfx940-hazards.mir
new file mode 100644
index 0000000000000..7f84babc72c21
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx940-hazards.mir
@@ -0,0 +1,217 @@
+# RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: trans32_write_non_trans32_read
+# GCN: V_RCP_F32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MUL_F32
+name: trans32_write_non_trans32_read
+body: |
+ bb.0:
+ $vgpr1 = V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec
+ $vgpr2 = V_MUL_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: trans32_write_trans_read
+# GCN: V_SIN_F32
+# GCN-NEXT: V_COS_F32
+name: trans32_write_trans_read
+body: |
+ bb.0:
+ $vgpr0 = V_SIN_F32_e32 $vgpr1, implicit $mode, implicit $exec
+ $vgpr2 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: trans64_write_non_trans_read
+# GCN: V_RCP_F64
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_OR_B32
+name: trans64_write_non_trans_read
+body: |
+ bb.0:
+ $vgpr0_vgpr1 = V_RCP_F64_e32 $vgpr2_vgpr3, implicit $mode, implicit $exec
+ $vgpr4 = V_OR_B32_e32 $vgpr1, $vgpr5, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: trans32_write_non_trans64_read
+# GCN: V_EXP_F32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MUL_F64
+name: trans32_write_non_trans64_read
+body: |
+ bb.0:
+ $vgpr1 = V_EXP_F32_e32 $vgpr0, implicit $mode, implicit $exec
+ $vgpr2_vgpr3 = V_MUL_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: opsel_hi16_write_valu_read
+# GCN: V_ADD_I16
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MUL_F64
+name: opsel_hi16_write_valu_read
+body: |
+ bb.0:
+ $vgpr0 = V_ADD_I16_e64 8, $vgpr1, 0, $vgpr2, 0, 0, implicit $exec
+ $vgpr4_vgpr5 = V_MUL_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: opsel_lo16_write_valu_read
+# GCN: V_ADD_I16
+# GCN-NEXT: V_MUL_F64
+name: opsel_lo16_write_valu_read
+body: |
+ bb.0:
+ $vgpr0 = V_ADD_I16_e64 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $exec
+ $vgpr4_vgpr5 = V_MUL_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: sdwa_hi16_write_valu_read
+# GCN: V_MOV_B32_sdwa
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MOV_B32_e32
+name: sdwa_hi16_write_valu_read
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_sdwa 0, $vgpr1, 0, 5, 2, 5, implicit $vgpr0(tied-def 0), implicit $exec
+ $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: sdwa_lo16_write_valu_read
+# GCN: V_MOV_B32_sdwa
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MOV_B32_e32
+name: sdwa_lo16_write_valu_read
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_sdwa 0, $vgpr1, 0, 4, 2, 5, implicit $vgpr0(tied-def 0), implicit $exec
+ $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: sdwa_dword_write_valu_read
+# GCN: V_MOV_B32_sdwa
+# GCN-NEXT: V_MOV_B32_e32
+name: sdwa_dword_write_valu_read
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_sdwa 0, $vgpr1, 0, 6, 2, 5, implicit $vgpr0(tied-def 0), implicit $exec
+ $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: sdwa_lo16_no_write_valu_read
+# GCN: V_CMP_EQ_U32_sdwa
+# GCN-NEXT: V_MOV_B32_e32
+name: sdwa_lo16_no_write_valu_read
+body: |
+ bb.0:
+ $vcc = V_CMP_EQ_U32_sdwa 0, $vgpr1, 0, $vgpr0, 0, 5, 2, implicit $exec
+ $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: valu_write_sgpr_valu_read_as_constant
+# GCN: V_READFIRSTLANE_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MOV_B32_e32
+name: valu_write_sgpr_valu_read_as_constant
+body: |
+ bb.0:
+ $sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: valu_write_vcc_valu_read_as_constant
+# GCN: V_CMP_NE_U32_e32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_ADDC_U32_e32
+name: valu_write_vcc_valu_read_as_constant
+body: |
+ bb.0:
+ V_CMP_NE_U32_e32 0, $vgpr0, implicit-def $vcc, implicit $exec
+ $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+...
+
+# GCN-LABEL: name: valu_write_sgpr_readlane_read_as_laneselect
+# GCN: V_READFIRSTLANE_B32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_READLANE_B32
+name: valu_write_sgpr_readlane_read_as_laneselect
+body: |
+ bb.0:
+ $sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ $sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: valu_write_sgpr_writelane_read_as_laneselect
+# GCN: V_ADD_CO_U32_e64
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_WRITELANE_B32
+name: valu_write_sgpr_writelane_read_as_laneselect
+body: |
+ bb.0:
+ $vgpr0, $sgpr0_sgpr1 = V_ADD_CO_U32_e64 $vgpr0, 1, 0, implicit $exec
+ $vgpr1 = V_WRITELANE_B32 0, $sgpr0, $vgpr1, implicit $exec
+...
+
+# GCN-LABEL: name: vcmpx_write_exec_valu_read_as_constant
+# GCN: V_CMPX_EQ_I32_e32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MOV_B32_e32
+name: vcmpx_write_exec_valu_read_as_constant
+body: |
+ bb.0:
+ implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 $exec_lo, implicit $exec
+...
+
+# GCN-LABEL: name: vcmpx_write_exec_readlane
+# GCN: V_CMPX_EQ_I32_e32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_READLANE_B32
+name: vcmpx_write_exec_readlane
+body: |
+ bb.0:
+ implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+ $sgpr1 = V_READLANE_B32 $vgpr1, 0, implicit $exec
+...
+
+# GCN-LABEL: name: vcmpx_write_exec_readfirstlane
+# GCN: V_CMPX_EQ_I32_e32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_READFIRSTLANE_B32
+name: vcmpx_write_exec_readfirstlane
+body: |
+ bb.0:
+ implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+ $sgpr1 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+...
+
+# GCN-LABEL: name: vcmpx_write_exec_writelane
+# GCN: V_CMPX_EQ_I32_e32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_WRITELANE_B32
+name: vcmpx_write_exec_writelane
+body: |
+ bb.0:
+ implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+ $vgpr1 = V_WRITELANE_B32 0, $sgpr0, $vgpr1, implicit $exec
+...
+
+# GCN-LABEL: name: valu_write_vgpr_readlane_read
+# GCN: V_ADD_CO_U32_e32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_READLANE_B32
+name: valu_write_vgpr_readlane_read
+body: |
+ bb.0:
+ $vgpr1 = V_ADD_CO_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $exec
+ $sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
+...
+
+# GCN-LABEL: name: valu_write_vgpr_readfirstlane_read
+# GCN: V_ADD_CO_U32_e32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_READFIRSTLANE_B32
+name: valu_write_vgpr_readfirstlane_read
+body: |
+ bb.0:
+ $vgpr1 = V_ADD_CO_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $exec
+ $sgpr1 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+...
More information about the llvm-commits
mailing list