[llvm] r318557 - AMDGPU: Move hazard avoidance out of waitcnt pass.

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 17 13:35:32 PST 2017


Author: arsenm
Date: Fri Nov 17 13:35:32 2017
New Revision: 318557

URL: http://llvm.org/viewvc/llvm-project?rev=318557&view=rev
Log:
AMDGPU: Move hazard avoidance out of waitcnt pass.

This is mostly moving VMEM clause breaking into
the hazard recognizer. Also move another hazard
currently handled in the waitcnt pass.

Also stops breaking clauses unless xnack is enabled.

Added:
    llvm/trunk/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir
    llvm/trunk/test/CodeGen/AMDGPU/sendmsg-m0-hazard.mir
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
    llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
    llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h
    llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h?rev=318557&r1=318556&r2=318557&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h Fri Nov 17 13:35:32 2017
@@ -806,10 +806,14 @@ public:
     return getGeneration() >= AMDGPUSubtarget::GFX9;
   }
 
-  bool hasReadM0Hazard() const {
+  bool hasReadM0MovRelInterpHazard() const {
     return getGeneration() >= AMDGPUSubtarget::GFX9;
   }
 
+  bool hasReadM0SendMsgHazard() const {
+    return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
+  }
+
   unsigned getKernArgSegmentSize(const MachineFunction &MF,
                                  unsigned ExplictArgBytes) const;
 

Modified: llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp?rev=318557&r1=318556&r2=318557&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp Fri Nov 17 13:35:32 2017
@@ -87,6 +87,18 @@ static bool isSMovRel(unsigned Opcode) {
   }
 }
 
+static bool isSendMsgTraceDataOrGDS(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::S_SENDMSG:
+  case AMDGPU::S_SENDMSGHALT:
+  case AMDGPU::S_TTRACEDATA:
+    return true;
+  default:
+    // TODO: GDS
+    return false;
+  }
+}
+
 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                      AMDGPU::OpName::simm16);
@@ -100,7 +112,10 @@ GCNHazardRecognizer::getHazardType(SUnit
   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
     return NoopHazard;
 
-  if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
+  // FIXME: Should flat be considered vmem?
+  if ((SIInstrInfo::isVMEM(*MI) ||
+       SIInstrInfo::isFLAT(*MI))
+      && checkVMEMHazards(MI) > 0)
     return NoopHazard;
 
   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
@@ -124,7 +139,12 @@ GCNHazardRecognizer::getHazardType(SUnit
   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
     return NoopHazard;
 
-  if ((TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
+  if (ST.hasReadM0MovRelInterpHazard() &&
+      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
+      checkReadM0Hazards(MI) > 0)
+    return NoopHazard;
+
+  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI) &&
       checkReadM0Hazards(MI) > 0)
     return NoopHazard;
 
@@ -144,26 +164,20 @@ unsigned GCNHazardRecognizer::PreEmitNoo
   if (SIInstrInfo::isSMRD(*MI))
     return std::max(WaitStates, checkSMRDHazards(MI));
 
-  if (SIInstrInfo::isVALU(*MI)) {
-      WaitStates = std::max(WaitStates, checkVALUHazards(MI));
+  if (SIInstrInfo::isVALU(*MI))
+    WaitStates = std::max(WaitStates, checkVALUHazards(MI));
 
-    if (SIInstrInfo::isVMEM(*MI))
-      WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
+  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
+    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
 
-    if (SIInstrInfo::isDPP(*MI))
-      WaitStates = std::max(WaitStates, checkDPPHazards(MI));
+  if (SIInstrInfo::isDPP(*MI))
+    WaitStates = std::max(WaitStates, checkDPPHazards(MI));
 
-    if (isDivFMas(MI->getOpcode()))
-      WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
+  if (isDivFMas(MI->getOpcode()))
+    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
 
-    if (isRWLane(MI->getOpcode()))
-      WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
-
-    if (TII.isVINTRP(*MI))
-      WaitStates = std::max(WaitStates, checkReadM0Hazards(MI));
-
-    return WaitStates;
-  }
+  if (isRWLane(MI->getOpcode()))
+    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
 
   if (isSGetReg(MI->getOpcode()))
     return std::max(WaitStates, checkGetRegHazards(MI));
@@ -174,7 +188,11 @@ unsigned GCNHazardRecognizer::PreEmitNoo
   if (isRFE(MI->getOpcode()))
     return std::max(WaitStates, checkRFEHazards(MI));
 
-  if (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))
+  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
+                                           isSMovRel(MI->getOpcode())))
+    return std::max(WaitStates, checkReadM0Hazards(MI));
+
+  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI))
     return std::max(WaitStates, checkReadM0Hazards(MI));
 
   return WaitStates;
@@ -282,12 +300,14 @@ void GCNHazardRecognizer::addClauseInst(
   addRegsToSet(TRI, MI.uses(), ClauseUses);
 }
 
-int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
+int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
   // SMEM soft clause are only present on VI+, and only matter if xnack is
   // enabled.
   if (!ST.isXNACKEnabled())
     return 0;
 
+  bool IsSMRD = TII.isSMRD(*MEM);
+
   resetClause();
 
   // A soft-clause is any group of consecutive SMEM instructions.  The
@@ -303,7 +323,10 @@ int GCNHazardRecognizer::checkSMEMSoftCl
   for (MachineInstr *MI : EmittedInstrs) {
     // When we hit a non-SMEM instruction then we have passed the start of the
     // clause and we can stop.
-    if (!MI || !SIInstrInfo::isSMRD(*MI))
+    if (!MI)
+      break;
+
+    if (IsSMRD != SIInstrInfo::isSMRD(*MI))
       break;
 
     addClauseInst(*MI);
@@ -312,13 +335,13 @@ int GCNHazardRecognizer::checkSMEMSoftCl
   if (ClauseDefs.none())
     return 0;
 
-  // FIXME: When we support stores, we need to make sure not to put loads and
-  // stores in the same clause if they use the same address.  For now, just
-  // start a new clause whenever we see a store.
-  if (SMEM->mayStore())
+  // We need to make sure not to put loads and stores in the same clause if they
+  // use the same address. For now, just start a new clause whenever we see a
+  // store.
+  if (MEM->mayStore())
     return 1;
 
-  addClauseInst(*SMEM);
+  addClauseInst(*MEM);
 
   // If the set of defs and uses intersect then we cannot add this instruction
   // to the clause, so we have a hazard.
@@ -329,7 +352,7 @@ int GCNHazardRecognizer::checkSMRDHazard
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   int WaitStatesNeeded = 0;
 
-  WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD);
+  WaitStatesNeeded = checkSoftClauseHazards(SMRD);
 
   // This SMRD hazard only affects SI.
   if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS)
@@ -369,18 +392,15 @@ int GCNHazardRecognizer::checkSMRDHazard
 }
 
 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
-  const SIInstrInfo *TII = ST.getInstrInfo();
-
   if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
     return 0;
 
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
 
   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
   // SGPR was written by a VALU Instruction.
-  int VmemSgprWaitStates = 5;
-  int WaitStatesNeeded = 0;
-  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+  const int VmemSgprWaitStates = 5;
+  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
 
   for (const MachineOperand &Use : VMEM->uses()) {
     if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
@@ -598,11 +618,8 @@ int GCNHazardRecognizer::checkAnyInstHaz
 }
 
 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
-  if (!ST.hasReadM0Hazard())
-    return 0;
-
   const SIInstrInfo *TII = ST.getInstrInfo();
-  int SMovRelWaitStates = 1;
+  const int SMovRelWaitStates = 1;
   auto IsHazardFn = [TII] (MachineInstr *MI) {
     return TII->isSALU(*MI);
   };

Modified: llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h?rev=318557&r1=318556&r2=318557&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h Fri Nov 17 13:35:32 2017
@@ -58,7 +58,7 @@ class GCNHazardRecognizer final : public
                                 [](MachineInstr *) { return true; });
   int getWaitStatesSinceSetReg(function_ref<bool(MachineInstr *)> IsHazard);
 
-  int checkSMEMSoftClauseHazards(MachineInstr *SMEM);
+  int checkSoftClauseHazards(MachineInstr *SMEM);
   int checkSMRDHazards(MachineInstr *SMRD);
   int checkVMEMHazards(MachineInstr* VMEM);
   int checkDPPHazards(MachineInstr *DPP);

Modified: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp?rev=318557&r1=318556&r2=318557&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp Fri Nov 17 13:35:32 2017
@@ -1522,8 +1522,6 @@ void SIInsertWaitcnts::insertWaitcntInBl
     ScoreBrackets->dump();
   });
 
-  bool InsertNOP = false;
-
   // Walk over the instructions.
   for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
        Iter != E;) {
@@ -1624,58 +1622,6 @@ void SIInsertWaitcnts::insertWaitcntInBl
       VCCZBugHandledSet.insert(&Inst);
     }
 
-    if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
-
-      // This avoids a s_nop after a waitcnt has just been inserted.
-      if (!SWaitInst && InsertNOP) {
-        BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
-      }
-      InsertNOP = false;
-
-      // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
-      // or SMEM clause, respectively.
-      //
-      // The temporary workaround is to break the clauses with S_NOP.
-      //
-      // The proper solution would be to allocate registers such that all source
-      // and destination registers don't overlap, e.g. this is illegal:
-      //   r0 = load r2
-      //   r2 = load r0
-      bool IsSMEM = false;
-      bool IsVMEM = false;
-      if (TII->isSMRD(Inst))
-        IsSMEM = true;
-      else if (TII->usesVM_CNT(Inst))
-        IsVMEM = true;
-
-      ++Iter;
-      if (Iter == E)
-        break;
-
-      MachineInstr &Next = *Iter;
-
-      // TODO: How about consecutive SMEM instructions?
-      //       The comments above says break the clause but the code does not.
-      // if ((TII->isSMRD(next) && isSMEM) ||
-      if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM &&
-          // TODO: Enable this check when hasSoftClause is upstreamed.
-          // ST->hasSoftClauses() &&
-          ST->isXNACKEnabled()) {
-        // Insert a NOP to break the clause.
-        InsertNOP = true;
-        continue;
-      }
-
-      // There must be "S_NOP 0" between an instruction writing M0 and
-      // S_SENDMSG.
-      if ((Next.getOpcode() == AMDGPU::S_SENDMSG ||
-           Next.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
-          Inst.definesRegister(AMDGPU::M0))
-        InsertNOP = true;
-
-      continue;
-    }
-
     ++Iter;
   }
 

Added: llvm/trunk/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir?rev=318557&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir Fri Nov 17 13:35:32 2017
@@ -0,0 +1,580 @@
+# RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s
+---
+# Trivial clause at beginning of program
+name: trivial_clause_load_flat4_x1
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trivial_clause_load_flat4_x1
+    ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# Trivial clause at beginning of program
+name: trivial_clause_load_flat4_x2
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trivial_clause_load_flat4_x2
+    ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr1 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# Trivial clause at beginning of program
+name: trivial_clause_load_flat4_x3
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trivial_clause_load_flat4_x3
+    ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr5_vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr1 = FLAT_LOAD_DWORD %vgpr5_vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr2 = FLAT_LOAD_DWORD %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# Trivial clause at beginning of program
+name: trivial_clause_load_flat4_x4
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trivial_clause_load_flat4_x4
+    ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr8_vgpr9, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr10_vgpr11, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr1 = FLAT_LOAD_DWORD %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr2 = FLAT_LOAD_DWORD %vgpr8_vgpr9, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = FLAT_LOAD_DWORD %vgpr10_vgpr11, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# Reuse of same input pointer is OK
+
+name: trivial_clause_load_flat4_x2_sameptr
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trivial_clause_load_flat4_x2_sameptr
+    ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# 32-bit load partially clobbers its own ptr reg
+name: flat_load4_overwrite_ptr_lo
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: flat_load4_overwrite_ptr_lo
+    ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# 32-bit load partially clobbers its own ptr reg
+name: flat_load4_overwrite_ptr_hi
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: flat_load4_overwrite_ptr_hi
+    ; GCN: %vgpr1 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr1 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# 64-bit load clobbers its own ptr reg
+name: flat_load8_overwrite_ptr
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: flat_load8_overwrite_ptr
+    ; GCN: %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# vmcnt has 4 bits, so maximum 16 outstanding loads. The waitcnt
+# breaks the clause.
+
+
+name: break_clause_at_max_clause_size_flat_load4
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_clause_at_max_clause_size_flat_load4
+    ; GCN: %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr5 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr6 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr7 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr8 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr9 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr10 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr11 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr12 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr13 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr14 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr17 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %sgpr0 = S_MOV_B32 %sgpr0, implicit %vgpr2, implicit %vgpr3, implicit %vgpr4, implicit %vgpr5, implicit %vgpr6, implicit %vgpr7, implicit %vgpr8, implicit %vgpr9, implicit %vgpr10, implicit %vgpr11, implicit %vgpr12, implicit %vgpr13, implicit %vgpr14, implicit %vgpr15, implicit %vgpr16, implicit %vgpr17, implicit %vgpr18
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr5 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+    %vgpr6 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr7 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr8 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr9 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+    %vgpr10 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr11 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr12 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr13 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+    %vgpr14 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr17 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %sgpr0 = S_MOV_B32 %sgpr0, implicit %vgpr2, implicit %vgpr3, implicit %vgpr4, implicit %vgpr5, implicit %vgpr6, implicit %vgpr7, implicit %vgpr8, implicit %vgpr9, implicit %vgpr10, implicit %vgpr11, implicit %vgpr12, implicit %vgpr13, implicit %vgpr14, implicit %vgpr15, implicit %vgpr16, implicit %vgpr17, implicit %vgpr18
+    S_ENDPGM
+...
+---
+
+name: break_clause_simple_load_flat4_lo_ptr
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_clause_simple_load_flat4_lo_ptr
+    ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+
+name: break_clause_simple_load_flat4_hi_ptr
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_clause_simple_load_flat4_hi_ptr
+    ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+
+name: break_clause_simple_load_flat8_ptr
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_clause_simple_load_flat8_ptr
+    ; GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+
+
+name: break_clause_simple_load_flat16_ptr
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_clause_simple_load_flat16_ptr
+    ; GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+    %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+
+# The clause would be broken by a waitcnt inserted at the end of the block,
+# so no nop should be needed, but with xnack one is still expected in bb.1.
+
+
+name: break_clause_block_boundary_load_flat8_ptr
+
+body: |
+  ; GCN-LABEL: name: break_clause_block_boundary_load_flat8_ptr
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.1(0x80000000)
+  ; GCN:   %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+  ; GCN: bb.1:
+  ; XNACK-NEXT:  S_NOP 0
+  ; GCN-NEXT:   %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+  ; GCN-NEXT:   S_ENDPGM
+
+  bb.0:
+    %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+  bb.1:
+    %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# The load clobbers the pointer of the store, so it needs to break.
+
+name: break_clause_store_load_into_ptr_flat4
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_clause_store_load_into_ptr_flat4
+    ; GCN: FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# The load clobbers the data of the store, so it needs to break.
+# FIXME: Would it be better to s_nop and wait later?
+
+name: break_clause_store_load_into_data_flat4
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_clause_store_load_into_data_flat4
+    ; GCN: FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# Regular VALU instruction breaks clause, so no nop should be needed, but one is still expected with xnack
+
+name: valu_inst_breaks_clause
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: valu_inst_breaks_clause
+    ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr8 = V_MOV_B32_e32 0, implicit %exec
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr8 = V_MOV_B32_e32 0, implicit %exec
+    %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# Regular SALU instruction breaks clause, so no nop should be needed, but one is still expected with xnack
+
+name: salu_inst_breaks_clause
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: salu_inst_breaks_clause
+    ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %sgpr8 = S_MOV_B32 0
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %sgpr8 = S_MOV_B32 0
+    %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+
+name: ds_inst_breaks_clause
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: ds_inst_breaks_clause
+    ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr8 = DS_READ_B32 %vgpr9, 0, 0, implicit %m0, implicit %exec
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr8 = DS_READ_B32 %vgpr9, 0, 0, implicit %m0, implicit %exec
+    %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+
+name: smrd_inst_breaks_clause
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: smrd_inst_breaks_clause
+    ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %sgpr8 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 0, 0
+    ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %sgpr8 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 0, 0
+    %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# FIXME: Should this be handled?
+name: implicit_use_breaks_clause
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: implicit_use_breaks_clause
+    ; GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr, implicit %vgpr4_vgpr5
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr, implicit %vgpr4_vgpr5
+    %vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+name: trivial_clause_load_mubuf4_x2
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trivial_clause_load_mubuf4_x2
+    ; GCN: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    ; GCN-NEXT: %vgpr3 = BUFFER_LOAD_DWORD_OFFEN %vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = BUFFER_LOAD_DWORD_OFFEN %vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+...
+---
+name: break_clause_simple_load_mubuf_offen_ptr
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_clause_simple_load_mubuf_offen_ptr
+    ; GCN: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+...
+---
+# BUFFER instructions overwriting their own inputs is supposedly OK.
+
+name: mubuf_load4_overwrite_ptr
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: mubuf_load4_overwrite_ptr
+    ; GCN: %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    ; GCN-NEXT: %vgpr1 = V_MOV_B32_e32 0, implicit %exec
+    ; GCN-NEXT: %vgpr2 = V_MOV_B32_e32 %vgpr0, implicit %exec
+    ; GCN-NEXT: S_ENDPGM
+    %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    %vgpr1 = V_MOV_B32_e32 0, implicit %exec
+    %vgpr2 = V_MOV_B32_e32 %vgpr0, implicit %exec
+    S_ENDPGM
+...
+---
+# Break a clause from interference between mubuf and flat instructions
+
+name: break_clause_flat_load_mubuf_load
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_clause_flat_load_mubuf_load
+    ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+...
+# Break a clause from interference between mubuf and flat instructions
+
+# GCN-LABEL: name: break_clause_mubuf_load_flat_load
+# GCN: bb.0:
+# GCN-NEXT: %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4
+# XNACK-NEXT: S_NOP 0
+# GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3
+# GCN-NEXT: S_ENDPGM
+name: break_clause_mubuf_load_flat_load
+
+body: |
+  bb.0:
+    %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+    S_ENDPGM
+...
+---
+
+name: break_clause_atomic_rtn_into_ptr_flat4
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_flat4
+    ; GCN: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %vgpr4 = FLAT_ATOMIC_ADD_RTN %vgpr5_vgpr6, %vgpr7, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr4 = FLAT_ATOMIC_ADD_RTN %vgpr5_vgpr6, %vgpr7, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+name: break_clause_atomic_nortn_ptr_load_flat4
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_flat4
+    ; GCN: FLAT_ATOMIC_ADD %vgpr0_vgpr1, %vgpr2, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: S_ENDPGM
+
+    FLAT_ATOMIC_ADD %vgpr0_vgpr1, %vgpr2, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr2 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+
+name: break_clause_atomic_rtn_into_ptr_mubuf4
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_mubuf4
+    ; GCN: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN %vgpr2, %vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec
+    ; GCN-NEXT: S_ENDPGM
+
+    %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    %vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN %vgpr2, %vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec
+    S_ENDPGM
+...
+---
+
+name: break_clause_atomic_nortn_ptr_load_mubuf4
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_mubuf4
+    ; GCN: BUFFER_ATOMIC_ADD_OFFEN %vgpr0, %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec
+    ; GCN-NEXT: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    ; GCN-NEXT: S_ENDPGM
+
+    BUFFER_ATOMIC_ADD_OFFEN %vgpr0, %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec
+    %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+...
+---
+# Make sure there is no assert on mubuf instructions which do not have
+# vaddr, and that no register is added to track.
+name: no_break_clause_mubuf_load_novaddr
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: no_break_clause_mubuf_load_novaddr
+    ; GCN: %vgpr1 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    ; GCN-NEXT: %vgpr3 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    ; GCN-NEXT: S_ENDPGM
+    %vgpr1 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+...
+---
+# Loads and stores using different addresses theoretically do not
+# need a nop
+name: mix_load_store_clause
+body: |
+  bb.0:
+    ; GCN-LABEL: name: mix_load_store_clause
+    ; GCN: FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr10 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr11 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+    FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr10 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr11 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# Loads and stores using the same address need a nop.
+
+name: mix_load_store_clause_same_address
+body: |
+  bb.0:
+    ; GCN-LABEL: name: mix_load_store_clause_same_address
+    ; GCN: FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr10 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %vgpr11 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+    FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr10 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr11 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...

Modified: llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir?rev=318557&r1=318556&r2=318557&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir Fri Nov 17 13:35:32 2017
@@ -437,22 +437,22 @@ body: |
 
 # GCN-LABEL: bb.0:
 # GCN: S_MOV_B32
-# GFX9: S_NOP
+# GFX9-NEXT: S_NOP
 # GCN-NEXT: V_INTERP_P1_F32
 
 # GCN-LABEL: bb.1:
 # GCN: S_MOV_B32
-# GFX9: S_NOP
+# GFX9-NEXT: S_NOP
 # GCN-NEXT: V_INTERP_P2_F32
 
 # GCN-LABEL: bb.2:
 # GCN: S_MOV_B32
-# GFX9: S_NOP
+# GFX9-NEXT: S_NOP
 # GCN-NEXT: V_INTERP_P1_F32_16bank
 
 # GCN-LABEL: bb.3:
 # GCN: S_MOV_B32
-# GFX9: S_NOP
+# GFX9-NEXT: S_NOP
 # GCN-NEXT: V_INTERP_MOV_F32
 
 name: v_interp

Added: llvm/trunk/test/CodeGen/AMDGPU/sendmsg-m0-hazard.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sendmsg-m0-hazard.mir?rev=318557&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sendmsg-m0-hazard.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/sendmsg-m0-hazard.mir Fri Nov 17 13:35:32 2017
@@ -0,0 +1,49 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,VI %s
+# RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,CI %s
+# RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,SI %s
+
+---
+name: m0_sendmsg
+body: |
+  ; GCN-LABEL: name: m0_sendmsg
+  ; GCN:  %m0 = S_MOV_B32 -1
+  ; VI-NEXT: S_NOP 0
+  ; GFX9-NEXT: S_NOP 0
+  ; GCN-NEXT: S_SENDMSG 3, implicit %exec, implicit %m0
+
+  bb.0:
+    %m0 = S_MOV_B32 -1
+    S_SENDMSG 3, implicit %exec, implicit %m0
+    S_ENDPGM
+...
+---
+
+name: m0_sendmsghalt
+body: |
+  ; GCN-LABEL: name: m0_sendmsghalt
+  ; GCN:  %m0 = S_MOV_B32 -1
+  ; VI-NEXT: S_NOP 0
+  ; GFX9-NEXT: S_NOP 0
+  ; GCN-NEXT: S_SENDMSGHALT 3, implicit %exec, implicit %m0
+
+  bb.0:
+    %m0 = S_MOV_B32 -1
+    S_SENDMSGHALT 3, implicit %exec, implicit %m0
+    S_ENDPGM
+...
+---
+
+name: m0_ttracedata
+body: |
+  ; GCN-LABEL: name: m0_ttracedata
+  ; GCN:  %m0 = S_MOV_B32 -1
+  ; VI-NEXT: S_NOP 0
+  ; GFX9-NEXT: S_NOP 0
+  ; GCN-NEXT: S_TTRACEDATA implicit %m0
+
+  bb.0:
+    %m0 = S_MOV_B32 -1
+    S_TTRACEDATA implicit %m0
+    S_ENDPGM
+...




More information about the llvm-commits mailing list