[llvm] a35013b - [AMDGPU][GFX11] Mitigate VALU mask write hazard

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Sat Oct 1 00:49:26 PDT 2022


Author: Carl Ritson
Date: 2022-10-01T16:21:24+09:00
New Revision: a35013bec68db2cc3760c693dc4cb080f312396a

URL: https://github.com/llvm/llvm-project/commit/a35013bec68db2cc3760c693dc4cb080f312396a
DIFF: https://github.com/llvm/llvm-project/commit/a35013bec68db2cc3760c693dc4cb080f312396a.diff

LOG: [AMDGPU][GFX11] Mitigate VALU mask write hazard

VALU use of an SGPR (pair) as mask followed by SALU write to the
same SGPR can cause incorrect execution of subsequent SALU reads
of the SGPR.

Reviewed By: foad, rampitec

Differential Revision: https://reviews.llvm.org/D134151

Added: 
    llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir

Modified: 
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
    llvm/lib/Target/AMDGPU/GCNSubtarget.h

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index c015936cb6d8c..9f504f6d21563 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1102,6 +1102,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
   fixVALUTransUseHazard(MI);
   fixWMMAHazards(MI);
   fixShift64HighRegBug(MI);
+  fixVALUMaskWriteHazard(MI);
 }
 
 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
@@ -2709,3 +2710,140 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
 
   return false;
 }
+
+bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
+  if (!ST.isWave64())
+    return false;
+  if (!ST.hasVALUMaskWriteHazard())
+    return false;
+  if (!SIInstrInfo::isSALU(*MI))
+    return false;
+
+  // The hazard sequence is three instructions:
+  //   1. VALU reads SGPR as mask
+  //   2. SALU writes SGPR
+  //   3. SALU reads SGPR
+  // The hazard can expire if the distance between 2 and 3 is sufficient.
+  // In practice this happens <10% of the time, hence this always assumes
+  // the hazard exists if 1 and 2 are present to avoid searching.
+
+  const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
+  if (!SDSTOp || !SDSTOp->isReg())
+    return false;
+
+  const Register HazardReg = SDSTOp->getReg();
+  if (HazardReg == AMDGPU::EXEC ||
+      HazardReg == AMDGPU::EXEC_LO ||
+      HazardReg == AMDGPU::EXEC_HI ||
+      HazardReg == AMDGPU::M0)
+    return false;
+
+  auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
+    switch (I.getOpcode()) {
+    case AMDGPU::V_ADDC_U32_e32:
+    case AMDGPU::V_ADDC_U32_dpp:
+    case AMDGPU::V_CNDMASK_B16_e32:
+    case AMDGPU::V_CNDMASK_B16_dpp:
+    case AMDGPU::V_CNDMASK_B32_e32:
+    case AMDGPU::V_CNDMASK_B32_dpp:
+    case AMDGPU::V_DIV_FMAS_F32_e64:
+    case AMDGPU::V_DIV_FMAS_F64_e64:
+    case AMDGPU::V_SUBB_U32_e32:
+    case AMDGPU::V_SUBB_U32_dpp:
+    case AMDGPU::V_SUBBREV_U32_e32:
+    case AMDGPU::V_SUBBREV_U32_dpp:
+      // These implicitly read VCC as mask source.
+      return HazardReg == AMDGPU::VCC ||
+             HazardReg == AMDGPU::VCC_LO ||
+             HazardReg == AMDGPU::VCC_HI;
+    case AMDGPU::V_ADDC_U32_e64:
+    case AMDGPU::V_ADDC_U32_e64_dpp:
+    case AMDGPU::V_CNDMASK_B16_e64:
+    case AMDGPU::V_CNDMASK_B16_e64_dpp:
+    case AMDGPU::V_CNDMASK_B32_e64:
+    case AMDGPU::V_CNDMASK_B32_e64_dpp:
+    case AMDGPU::V_SUBB_U32_e64:
+    case AMDGPU::V_SUBB_U32_e64_dpp:
+    case AMDGPU::V_SUBBREV_U32_e64:
+    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
+      // Only check mask register overlaps.
+      const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
+      assert(SSRCOp);
+      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
+    }
+    default:
+      return false;
+    }
+  };
+
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
+    // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
+    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+        !(I.getOperand(0).getImm() & 0x1))
+      return true;
+
+    // VALU access to any SGPR or literal constant other than HazardReg
+    // mitigates hazard. No need to check HazardReg here as this will
+    // only be called when !IsHazardFn.
+    if (!SIInstrInfo::isVALU(I))
+      return false;
+    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
+      const MachineOperand &Op = I.getOperand(OpNo);
+      if (Op.isReg()) {
+        Register OpReg = Op.getReg();
+        // Only consider uses
+        if (!Op.isUse())
+          continue;
+        // Ignore EXEC
+        if (OpReg == AMDGPU::EXEC ||
+            OpReg == AMDGPU::EXEC_LO ||
+            OpReg == AMDGPU::EXEC_HI)
+          continue;
+        // Ignore all implicit uses except VCC
+        if (Op.isImplicit()) {
+          if (OpReg == AMDGPU::VCC ||
+              OpReg == AMDGPU::VCC_LO ||
+              OpReg == AMDGPU::VCC_HI)
+            return true;
+          continue;
+        }
+        if (TRI.isSGPRReg(MRI, OpReg))
+          return true;
+      } else {
+        const MCInstrDesc &InstDesc = I.getDesc();
+        const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
+        if (TII.isLiteralConstant(Op, OpInfo))
+          return true;
+      }
+    }
+    return false;
+  };
+
+  // Check for hazard
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  auto NextMI = std::next(MI->getIterator());
+
+  // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
+  BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
+          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+    .addImm(0xfffe);
+
+  // SALU write may be s_getpc in a bundle.
+  if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
+    // Update offsets of any references in the bundle.
+    while (NextMI != MI->getParent()->end() &&
+           NextMI->isBundledWithPred()) {
+      for (auto &Operand : NextMI->operands()) {
+        if (Operand.isGlobal())
+          Operand.setOffset(Operand.getOffset() + 4);
+      }
+      NextMI++;
+    }
+  }
+
+  return true;
+}

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index ac19b04fdbc74..3ccca527c626b 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -106,6 +106,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   bool fixVALUTransUseHazard(MachineInstr *MI);
   bool fixWMMAHazards(MachineInstr *MI);
   bool fixShift64HighRegBug(MachineInstr *MI);
+  bool fixVALUMaskWriteHazard(MachineInstr *MI);
 
   int checkMAIHazards(MachineInstr *MI);
   int checkMAIHazards908(MachineInstr *MI);

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4eb1cf8300f47..8079490b29ca0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1058,6 +1058,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasVALUTransUseHazard() const { return getGeneration() >= GFX11; }
 
+  bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; }
+
   /// Return if operations acting on VGPR tuples require even alignment.
   bool needsAlignedVGPRs() const { return GFX90AInsts; }
 

diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
new file mode 100644
index 0000000000000..f6203859a91be
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
@@ -0,0 +1,560 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+  @mem = internal unnamed_addr addrspace(4) constant [4 x <4 x i32>] [<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>]
+
+  define amdgpu_gs void @mask_hazard_getpc1() { ret void }
+  define amdgpu_gs void @mask_hazard_getpc2() { ret void }
+  define amdgpu_gs void @mask_hazard_vcc1() { ret void }
+  define amdgpu_gs void @mask_hazard_vcc2() { ret void }
+  define amdgpu_gs void @mask_hazard_cndmask_dpp1() { ret void }
+  define amdgpu_gs void @mask_hazard_cndmask_dpp2() { ret void }
+  define amdgpu_gs void @mask_hazard_cndmask_dpp3() { ret void }
+  define amdgpu_gs void @mask_hazard_cndmask_dpp4() { ret void }
+  define amdgpu_gs void @mask_hazard_addc1() { ret void }
+  define amdgpu_gs void @mask_hazard_addc2() { ret void }
+  define amdgpu_gs void @mask_hazard_addc3() { ret void }
+  define amdgpu_gs void @mask_hazard_addc4() { ret void }
+  define amdgpu_gs void @mask_hazard_subb1() { ret void }
+  define amdgpu_gs void @mask_hazard_subb2() { ret void }
+  define amdgpu_gs void @mask_hazard_subb3() { ret void }
+  define amdgpu_gs void @mask_hazard_subb4() { ret void }
+  define amdgpu_gs void @mask_hazard_subbrev1() { ret void }
+  define amdgpu_gs void @mask_hazard_subbrev2() { ret void }
+  define amdgpu_gs void @mask_hazard_subbrev3() { ret void }
+  define amdgpu_gs void @mask_hazard_subbrev4() { ret void }
+  define amdgpu_gs void @mask_hazard_div_fmas_f32() { ret void }
+  define amdgpu_gs void @mask_hazard_div_fmas_f64() { ret void }
+  define amdgpu_gs void @mask_hazard_subreg1() { ret void }
+  define amdgpu_gs void @mask_hazard_subreg2() { ret void }
+  define amdgpu_gs void @mask_hazard_subreg3() { ret void }
+  define amdgpu_gs void @mask_hazard_subreg4() { ret void }
+  define amdgpu_gs void @mask_hazard_subreg5() { ret void }
+  define amdgpu_gs void @mask_hazard_waitcnt() { ret void }
+  define amdgpu_gs void @mask_hazard_gap1() { ret void }
+  define amdgpu_gs void @mask_hazard_gap2() { ret void }
+  define amdgpu_gs void @mask_hazard_gap3() { ret void }
+  define amdgpu_gs void @mask_hazard_no_hazard1() { ret void }
+  define amdgpu_gs void @mask_hazard_no_hazard2() { ret void }
+  define amdgpu_gs void @mask_hazard_no_hazard3() { ret void }
+...
+
+---
+name:            mask_hazard_getpc1
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_getpc1
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    $sgpr0_sgpr1 = S_GETPC_B64
+    $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_getpc2
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_getpc2
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+    ; GCN-NEXT:   $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT:   S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc
+    ; GCN-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc, implicit $scc
+    ; GCN-NEXT: }
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    BUNDLE implicit-def $sgpr0_sgpr1 {
+      $sgpr0_sgpr1 = S_GETPC_B64
+      $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 4, implicit-def $scc
+      $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc, implicit $scc
+    }
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_vcc1
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_vcc1
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_vcc2
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_vcc2
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_cndmask_dpp1
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_cndmask_dpp1
+    ; GCN: $vgpr0 = V_CNDMASK_B32_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_CNDMASK_B32_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec
+    $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_cndmask_dpp2
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_cndmask_dpp2
+    ; GCN: $vgpr0 = V_CNDMASK_B32_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
+    ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_CNDMASK_B32_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
+    $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_cndmask_dpp3
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_cndmask_dpp3
+    ; GCN: $vgpr0 = V_CNDMASK_B16_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_CNDMASK_B16_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec
+    $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_cndmask_dpp4
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_cndmask_dpp4
+    ; GCN: $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
+    ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
+    $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_addc1
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_addc1
+    ; GCN: $vgpr1, $vcc = V_ADDC_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
+    ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1, $vcc = V_ADDC_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
+    $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_addc2
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_addc2
+    ; GCN: $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+    $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_addc3
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_addc3
+    ; GCN: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+    $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_addc4
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_addc4
+    ; GCN: $vgpr0, $sgpr2_sgpr3 = V_ADDC_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
+    ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0, $sgpr2_sgpr3 = V_ADDC_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
+    $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_subb1
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_subb1
+    ; GCN: $vgpr1, $vcc = V_SUBB_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
+    ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1, $vcc = V_SUBB_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
+    $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_subb2
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_subb2
+    ; GCN: $vgpr1 = V_SUBB_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_SUBB_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+    $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_subb3
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_subb3
+    ; GCN: $vgpr0 = V_SUBB_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_SUBB_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+    $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_subb4
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_subb4
+    ; GCN: $vgpr0, $sgpr2_sgpr3 = V_SUBB_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
+    ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0, $sgpr2_sgpr3 = V_SUBB_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
+    $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_subbrev1
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_subbrev1
+    ; GCN: $vgpr1, $vcc = V_SUBBREV_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
+    ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1, $vcc = V_SUBBREV_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
+    $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_subbrev2
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_subbrev2
+    ; GCN: $vgpr1 = V_SUBBREV_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_SUBBREV_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+    $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_subbrev3
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_subbrev3
+    ; GCN: $vgpr0 = V_SUBBREV_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_SUBBREV_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+    $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_subbrev4
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_subbrev4
+    ; GCN: $vgpr0, $sgpr2_sgpr3 = V_SUBBREV_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
+    ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0, $sgpr2_sgpr3 = V_SUBBREV_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
+    $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_div_fmas_f32
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_div_fmas_f32
+    ; GCN: $vgpr0 = V_DIV_FMAS_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_DIV_FMAS_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec
+    $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+---
+name:            mask_hazard_div_fmas_f64
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_div_fmas_f64
+    ; GCN: $vgpr0_vgpr1 = V_DIV_FMAS_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0_vgpr1 = V_DIV_FMAS_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $vcc, implicit $exec
+    $vcc = S_CSELECT_B64 -1, 0, implicit $scc
+    S_ENDPGM 0
+...
+
+# Check low word overlap
+---
+name:            mask_hazard_subreg1
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_subreg1
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GCN-NEXT: $sgpr2 = S_MOV_B32 0
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    $sgpr2 = S_MOV_B32 0
+    S_ENDPGM 0
+...
+
+# Check high word overlap
+---
+name:            mask_hazard_subreg2
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_subreg2
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GCN-NEXT: $sgpr3 = S_MOV_B32 0
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    $sgpr3 = S_MOV_B32 0
+    S_ENDPGM 0
+...
+
+# Check multiple subreg overlap
+---
+name:            mask_hazard_subreg3
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_subreg3
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GCN-NEXT: $sgpr2 = S_MOV_B32 0
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: $sgpr3 = S_MOV_B32 0
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    $sgpr2 = S_MOV_B32 0
+    $sgpr3 = S_MOV_B32 0
+    S_ENDPGM 0
+...
+
+# Check vcc_lo overlap
+---
+name:            mask_hazard_subreg4
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_subreg4
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc_lo = S_MOV_B32 0
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    $vcc_lo = S_MOV_B32 0
+    $sgpr2 = S_MOV_B32 $vcc_lo
+    S_ENDPGM 0
+...
+
+# Check vcc_hi overlap
+---
+name:            mask_hazard_subreg5
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_subreg5
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc_hi = S_MOV_B32 0
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    $vcc_hi = S_MOV_B32 0
+    $sgpr2 = S_MOV_B32 $vcc_hi
+    S_ENDPGM 0
+...
+
+# S_WAITCNT does not mitigate hazard
+---
+name:            mask_hazard_waitcnt
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_waitcnt
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NEXT: S_WAITCNT 0
+    ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    S_WAITCNT 0
+    $sgpr0_sgpr1 = S_GETPC_B64
+    $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+    S_ENDPGM 0
+...
+
+# Check implicit $exec
+---
+name:            mask_hazard_gap1
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_gap1
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+    $sgpr0_sgpr1 = S_GETPC_B64
+    $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+    S_ENDPGM 0
+...
+
+# Check implicit $mode
+---
+name:            mask_hazard_gap2
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_gap2
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode
+    ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode
+    $sgpr0_sgpr1 = S_GETPC_B64
+    $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+    S_ENDPGM 0
+...
+
+# Check explicit $exec
+---
+name:            mask_hazard_gap3
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_gap3
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2
+    ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2
+    $sgpr0_sgpr1 = S_GETPC_B64
+    $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+    S_ENDPGM 0
+...
+
+# Different SGPR write
+---
+name:            mask_hazard_no_hazard1
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_no_hazard1
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    $sgpr0 = S_MOV_B32 0
+    S_ENDPGM 0
+...
+
+# Different SGPR write with mask read overlap
+---
+name:            mask_hazard_no_hazard2
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_no_hazard2
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $vcc
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    $sgpr0_sgpr1 = S_MOV_B64 $vcc
+    S_ENDPGM 0
+...
+
+# Overlapping VGPR write
+---
+name:            mask_hazard_no_hazard3
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_no_hazard3
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    S_ENDPGM 0
+...


        


More information about the llvm-commits mailing list