[llvm] 13107c2 - [AMDGPU] Add support for GFX11 LDSDIR hazards

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 20 15:25:47 PDT 2022


Author: Jay Foad
Date: 2022-06-20T21:58:12+01:00
New Revision: 13107c2770dfdbb95ad07fa9235116fbf26e38f0

URL: https://github.com/llvm/llvm-project/commit/13107c2770dfdbb95ad07fa9235116fbf26e38f0
DIFF: https://github.com/llvm/llvm-project/commit/13107c2770dfdbb95ad07fa9235116fbf26e38f0.diff

LOG: [AMDGPU] Add support for GFX11 LDSDIR hazards

Detect LDS direct WAR/WAW hazards and compute values for
wait_vdst (va_vdst) parameter.  Where appropriate this
raises wait_vdst from the default 0 to allow concurrent
issue of LDS direct with VALU execution.

Also detect LDS direct versus VMEM source VGPR hazards
and insert vm_vsrc=0 waits using s_waitcnt_depctr.

Differential Revision: https://reviews.llvm.org/D127963

Added: 
    llvm/test/CodeGen/AMDGPU/lds-direct-hazards.mir

Modified: 
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 8adae23a43d5f..f16bbb99c4578 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -427,6 +427,7 @@ void GCNHazardRecognizer::RecedeCycle() {
 typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;
 
 typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
+typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
 
 // Search for a hazard in a block and its predecessors.
 template <typename StateT>
@@ -473,11 +474,11 @@ hasHazard(StateT State,
 // Returns a minimum wait states since \p I walking all predecessors.
 // Only scans until \p IsExpired does not return true.
 // Can only be run in a hazard recognizer mode.
-static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
-                              const MachineBasicBlock *MBB,
-                              MachineBasicBlock::const_reverse_instr_iterator I,
-                              int WaitStates, IsExpiredFn IsExpired,
-                              DenseSet<const MachineBasicBlock *> &Visited) {
+static int getWaitStatesSince(
+    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
+    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
+    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
+    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
   for (auto E = MBB->instr_rend(); I != E; ++I) {
     // Don't add WaitStates for parent BUNDLE instructions.
     if (I->isBundle())
@@ -489,7 +490,7 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
     if (I->isInlineAsm())
       continue;
 
-    WaitStates += SIInstrInfo::getNumWaitStates(*I);
+    WaitStates += GetNumWaitStates(*I);
 
     if (IsExpired(*I, WaitStates))
       return std::numeric_limits<int>::max();
@@ -500,8 +501,8 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
     if (!Visited.insert(Pred).second)
       continue;
 
-    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
-                               WaitStates, IsExpired, Visited);
+    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
+                               IsExpired, Visited, GetNumWaitStates);
 
     MinWaitStates = std::min(MinWaitStates, W);
   }
@@ -1075,6 +1076,10 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
   fixSMEMtoVectorWriteHazards(MI);
   fixVcmpxExecWARHazard(MI);
   fixLdsBranchVmemWARHazard(MI);
+  if (ST.hasLdsDirect()) {
+    fixLdsDirectVALUHazard(MI);
+    fixLdsDirectVMEMHazard(MI);
+  }
   fixVALUPartialForwardingHazard(MI);
   fixVALUTransUseHazard(MI);
 }
@@ -1366,6 +1371,81 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
   return true;
 }
 
+bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
+  if (!SIInstrInfo::isLDSDIR(*MI))
+    return false;
+
+  const int NoHazardWaitStates = 15;
+  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
+  const Register VDSTReg = VDST->getReg();
+
+  bool VisitedTrans = false;
+  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
+    if (!SIInstrInfo::isVALU(I))
+      return false;
+    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
+    // Cover both WAR and WAW
+    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
+  };
+  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
+    if (WaitStates >= NoHazardWaitStates)
+      return true;
+    // Instructions which cause va_vdst==0 expire hazard
+    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
+           SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
+  };
+  auto GetWaitStatesFn = [](const MachineInstr &MI) {
+    return SIInstrInfo::isVALU(MI) ? 1 : 0;
+  };
+
+  DenseSet<const MachineBasicBlock *> Visited;
+  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
+                                    std::next(MI->getReverseIterator()), 0,
+                                    IsExpiredFn, Visited, GetWaitStatesFn);
+
+  // Transcendentals can execute in parallel to other VALUs.
+  // This makes va_vdst count unusable with a mixture of VALU and TRANS.
+  if (VisitedTrans)
+    Count = 0;
+
+  MachineOperand *WaitVdstOp =
+      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
+  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
+
+  return true;
+}
+
+bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
+  if (!SIInstrInfo::isLDSDIR(*MI))
+    return false;
+
+  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
+  const Register VDSTReg = VDST->getReg();
+
+  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
+    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
+        !SIInstrInfo::isDS(I))
+      return false;
+    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
+  };
+  auto IsExpiredFn = [](const MachineInstr &I, int) {
+    return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
+           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
+           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+            I.getOperand(0).getImm() == 0xffe3);
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+      .addImm(0xffe3);
+
+  return true;
+}
+
 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
   if (!ST.isWave64())
     return false;

diff  --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 1e12e0820f2ef..116e421e91722 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -96,6 +96,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   bool fixSMEMtoVectorWriteHazards(MachineInstr *MI);
   bool fixVcmpxExecWARHazard(MachineInstr *MI);
   bool fixLdsBranchVmemWARHazard(MachineInstr *MI);
+  bool fixLdsDirectVALUHazard(MachineInstr *MI);
+  bool fixLdsDirectVMEMHazard(MachineInstr *MI);
   bool fixVALUPartialForwardingHazard(MachineInstr *MI);
   bool fixVALUTransUseHazard(MachineInstr *MI);
 

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-direct-hazards.mir b/llvm/test/CodeGen/AMDGPU/lds-direct-hazards.mir
new file mode 100644
index 0000000000000..25311ef3f8b63
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-direct-hazards.mir
@@ -0,0 +1,409 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name:            lds_param_load_no_war
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_no_war
+    ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_va_vdst0_war
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_va_vdst0_war
+    ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_va_vdst0_war_salu
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_va_vdst0_war_salu
+    ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; GCN-NEXT: $m0 = S_MOV_B32 killed $sgpr0
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    $m0 = S_MOV_B32 killed $sgpr0
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_va_vdst1_war
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_va_vdst1_war
+    ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 1, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_va_vdst10_war
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_va_vdst10_war
+    ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 10, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_va_vdst10_waw
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_va_vdst10_waw
+    ; GCN: $vgpr1 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 10, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_va_vdst20_war
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_va_vdst20_war
+    ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr12 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr13 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr14 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr15 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr16 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr17 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr18 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr19 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr20 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr21 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr12 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr13 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr14 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr15 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr16 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr17 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr18 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr19 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr20 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr21 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_valu_war_trans
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_valu_war_trans
+    ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 4095
+    ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    $vgpr2 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
+    $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_trans_war_valu
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_trans_war_valu
+    ; GCN: $vgpr0 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec
+    $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_valu_war_vmem
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_valu_war_vmem
+    ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
+    ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_valu_war_lds
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_valu_war_lds
+    ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr10 = DS_READ_B32 $vgpr2, 0, 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr10 = DS_READ_B32 $vgpr2, 0, 0, implicit $m0, implicit $exec
+    $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_valu_war_ldsdir
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_valu_war_ldsdir
+    ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr10 = LDS_PARAM_LOAD 0, 1, 15, implicit $m0, implicit $exec
+    ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 4, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr10 = LDS_PARAM_LOAD 0, 1, 15, implicit $m0, implicit $exec
+    $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 4, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_vmem_war
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_vmem_war
+    ; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65507
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_vmem_war_valu
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_vmem_war_valu
+    ; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
+    ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_vmem_war_exp
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_vmem_war_exp
+    ; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
+    ; GCN-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_vmem_war_waitcnt
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_vmem_war_waitcnt
+    ; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
+    ; GCN-NEXT: S_WAITCNT 0
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    S_WAITCNT 0
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_vmem_war_waitcnt_depctr
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_vmem_war_waitcnt_depctr
+    ; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65507
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    S_WAITCNT_DEPCTR 65507
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_param_load_vmem_war_waitcnt_depctr2
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_param_load_vmem_war_waitcnt_depctr2
+    ; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65535
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65507
+    ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    S_WAITCNT_DEPCTR 65535
+    $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_direct_load_no_war
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_direct_load_no_war
+    ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_DIRECT_LOAD 15, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    $vgpr1 = LDS_DIRECT_LOAD 0, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            lds_direct_load_va_vdst0_war
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: lds_direct_load_va_vdst0_war
+    ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr1 = LDS_DIRECT_LOAD 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    $vgpr1 = LDS_DIRECT_LOAD 15, implicit $m0, implicit $exec
+    S_ENDPGM 0
+...


        


More information about the llvm-commits mailing list