[llvm] r357634 - AMDGPU: Split block for si_end_cf

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 3 13:53:20 PDT 2019


Author: arsenm
Date: Wed Apr  3 13:53:20 2019
New Revision: 357634

URL: http://llvm.org/viewvc/llvm-project?rev=357634&view=rev
Log:
AMDGPU: Split block for si_end_cf

Relying on no spill or other code being inserted before this was
precarious. It relied on code diligently checking isBasicBlockPrologue,
which is likely to be forgotten.

Ideally this could be done earlier, but that doesn't work because of
phis: no other instruction can be placed before them, so we have to
accept the position being incorrect during SSA.

This avoids regressions in the fast register allocator rewrite from
inverting the direction.
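
To illustrate the effect (a hand-written sketch distilled from the
collapse-endcf.mir changes below, not itself part of the commit): the
exec restore emitted for si_end_cf becomes a terminator, and the
instructions that previously followed it move into a new fallthrough
block. Roughly:

    ; Before: the exec restore sits mid-block, so nothing stops the
    ; register allocator from inserting spill code ahead of it.
    bb.4:
      $exec = S_OR_B64 $exec, %3, implicit-def $scc
      %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
      S_ENDPGM 0

    ; After: the restore is an S_OR_B64_term terminator ending its own
    ; block, and the remaining instructions fall through into a split
    ; successor block.
    bb.4:
      successors: %bb.5(0x80000000)
      $exec = S_OR_B64_term $exec, %3, implicit-def $scc

    bb.5:
      %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
      S_ENDPGM 0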

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
    llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
    llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
    llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
    llvm/trunk/test/CodeGen/AMDGPU/collapse-endcf.mir
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=357634&r1=357633&r2=357634&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Wed Apr  3 13:53:20 2019
@@ -1214,6 +1214,12 @@ bool SIInstrInfo::expandPostRAPseudo(Mac
     MI.setDesc(get(AMDGPU::S_XOR_B64));
     break;
 
+  case AMDGPU::S_OR_B64_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_OR_B64));
+    break;
+
   case AMDGPU::S_ANDN2_B64_term:
     // This is only a terminator to get the correct spill code placement during
     // register allocation.
@@ -1698,6 +1704,7 @@ bool SIInstrInfo::analyzeBranch(MachineB
     case AMDGPU::SI_MASK_BRANCH:
     case AMDGPU::S_MOV_B64_term:
     case AMDGPU::S_XOR_B64_term:
+    case AMDGPU::S_OR_B64_term:
     case AMDGPU::S_ANDN2_B64_term:
       break;
     case AMDGPU::SI_IF:

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=357634&r1=357633&r2=357634&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Wed Apr  3 13:53:20 2019
@@ -189,6 +189,7 @@ class WrapTerminatorInst<SOP_Pseudo base
 }
 
 def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
+def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
 def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
 def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
 

Modified: llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp?rev=357634&r1=357633&r2=357634&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp Wed Apr  3 13:53:20 2019
@@ -55,6 +55,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -79,8 +80,11 @@ class SILowerControlFlow : public Machin
 private:
   const SIRegisterInfo *TRI = nullptr;
   const SIInstrInfo *TII = nullptr;
-  LiveIntervals *LIS = nullptr;
   MachineRegisterInfo *MRI = nullptr;
+  LiveIntervals *LIS = nullptr;
+  MachineDominatorTree *DT = nullptr;
+  MachineLoopInfo *MLI = nullptr;
+
 
   void emitIf(MachineInstr &MI);
   void emitElse(MachineInstr &MI);
@@ -111,7 +115,7 @@ public:
     AU.addPreservedID(LiveVariablesID);
     AU.addPreservedID(MachineLoopInfoID);
     AU.addPreservedID(MachineDominatorsID);
-    AU.setPreservesCFG();
+
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -388,23 +392,99 @@ void SILowerControlFlow::emitLoop(Machin
   MI.eraseFromParent();
 }
 
+// Insert \p Inst (which modifies exec) at \p InsPt in \p MBB, such that \p MBB
+// is split as necessary to keep the exec modification in its own block.
+static MachineBasicBlock *insertInstWithExecFallthrough(MachineBasicBlock &MBB,
+                                                        MachineInstr &MI,
+                                                        MachineInstr *NewMI,
+                                                        MachineDominatorTree *DT,
+                                                        LiveIntervals *LIS,
+                                                        MachineLoopInfo *MLI) {
+  assert(NewMI->isTerminator());
+
+  MachineBasicBlock::iterator InsPt = MI.getIterator();
+  if (std::next(MI.getIterator()) == MBB.end()) {
+    // Don't bother with a new block.
+    MBB.insert(InsPt, NewMI);
+    if (LIS)
+      LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+    MI.eraseFromParent();
+    return &MBB;
+  }
+
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock *SplitMBB
+    = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+  MF->insert(++MachineFunction::iterator(MBB), SplitMBB);
+
+  // FIXME: This is working around a MachineDominatorTree API defect.
+  //
+  // If a previous pass split a critical edge, it may not have been applied to
+  // the DomTree yet. applySplitCriticalEdges is lazily applied, and inspects
+  // the CFG of the given block. Make sure to call a dominator tree method that
+  // will flush this cache before touching the successors of the block.
+  MachineDomTreeNode *NodeMBB = nullptr;
+  if (DT)
+    NodeMBB = DT->getNode(&MBB);
+
+  // Move everything to the new block, except the end_cf pseudo.
+  SplitMBB->splice(SplitMBB->begin(), &MBB, MBB.begin(), MBB.end());
+
+  SplitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(SplitMBB, BranchProbability::getOne());
+
+  MBB.insert(MBB.end(), NewMI);
+
+  if (DT) {
+    std::vector<MachineDomTreeNode *> Children = NodeMBB->getChildren();
+    DT->addNewBlock(SplitMBB, &MBB);
+
+    // Reparent all of the children to the new block body.
+    auto *SplitNode = DT->getNode(SplitMBB);
+    for (auto *Child : Children)
+      DT->changeImmediateDominator(Child, SplitNode);
+  }
+
+  if (MLI) {
+    if (MachineLoop *Loop = MLI->getLoopFor(&MBB))
+      Loop->addBasicBlockToLoop(SplitMBB, MLI->getBase());
+  }
+
+  if (LIS) {
+    LIS->insertMBBInMaps(SplitMBB);
+    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+  }
+
+  // All live-ins are forwarded.
+  for (auto &LiveIn : MBB.liveins())
+    SplitMBB->addLiveIn(LiveIn);
+
+  MI.eraseFromParent();
+  return SplitMBB;
+}
+
 void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
 
   MachineBasicBlock::iterator InsPt = MBB.begin();
-  MachineInstr *NewMI =
-      BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
-          .addReg(AMDGPU::EXEC)
-          .add(MI.getOperand(0));
 
+  // First, move the instruction. It's unnecessarily difficult to update
+  // LiveIntervals when there's a change in control flow, so move the
+  // instruction before changing the blocks.
+  MBB.splice(InsPt, &MBB, MI.getIterator());
   if (LIS)
-    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+    LIS->handleMove(MI);
 
-  MI.eraseFromParent();
+  MachineFunction *MF = MBB.getParent();
 
-  if (LIS)
-    LIS->handleMove(*NewMI);
+  // Create instruction without inserting it yet.
+  MachineInstr *NewMI
+    = BuildMI(*MF, DL, TII->get(AMDGPU::S_OR_B64_term), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .add(MI.getOperand(0));
+  insertInstWithExecFallthrough(MBB, MI, NewMI, DT, LIS, MLI);
 }
 
 // Returns replace operands for a logical operation, either single result
@@ -470,17 +550,20 @@ bool SILowerControlFlow::runOnMachineFun
 
   // This doesn't actually need LiveIntervals, but we can preserve them.
   LIS = getAnalysisIfAvailable<LiveIntervals>();
+  DT = getAnalysisIfAvailable<MachineDominatorTree>();
+  MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+
   MRI = &MF.getRegInfo();
 
   MachineFunction::iterator NextBB;
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; BI = NextBB) {
     NextBB = std::next(BI);
-    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock *MBB = &*BI;
 
     MachineBasicBlock::iterator I, Next, Last;
 
-    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
+    for (I = MBB->begin(), Last = MBB->end(); I != MBB->end(); I = Next) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
@@ -501,10 +584,24 @@ bool SILowerControlFlow::runOnMachineFun
         emitLoop(MI);
         break;
 
-      case AMDGPU::SI_END_CF:
+      case AMDGPU::SI_END_CF: {
+        MachineInstr *NextMI = nullptr;
+
+        if (Next != MBB->end())
+          NextMI = &*Next;
+
         emitEndCf(MI);
-        break;
 
+        if (NextMI) {
+          MBB = NextMI->getParent();
+          Next = NextMI->getIterator();
+          Last = MBB->end();
+        }
+
+        NextBB = std::next(MBB->getIterator());
+        BE = MF.end();
+        break;
+      }
       case AMDGPU::S_AND_B64:
       case AMDGPU::S_OR_B64:
         // Cleanup bit manipulations on exec mask
@@ -518,7 +615,7 @@ bool SILowerControlFlow::runOnMachineFun
       }
 
       // Replay newly inserted code to combine masks
-      Next = (Last == MBB.end()) ? MBB.begin() : Last;
+      Next = (Last == MBB->end()) ? MBB->begin() : Last;
     }
   }
 

Modified: llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp?rev=357634&r1=357633&r2=357634&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp Wed Apr  3 13:53:20 2019
@@ -149,6 +149,12 @@ static bool removeTerminatorBit(const SI
     MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
     return true;
   }
+  case AMDGPU::S_OR_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(TII.get(AMDGPU::S_OR_B64));
+    return true;
+  }
   case AMDGPU::S_ANDN2_B64_term: {
     // This is only a terminator to get the correct spill code placement during
     // register allocation.

Modified: llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp?rev=357634&r1=357633&r2=357634&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp Wed Apr  3 13:53:20 2019
@@ -83,7 +83,7 @@ FunctionPass *llvm::createSIOptimizeExec
 }
 
 static bool isEndCF(const MachineInstr& MI, const SIRegisterInfo* TRI) {
-  return MI.getOpcode() == AMDGPU::S_OR_B64 &&
+  return MI.getOpcode() == AMDGPU::S_OR_B64_term &&
          MI.modifiesRegister(AMDGPU::EXEC, TRI);
 }
 
@@ -362,7 +362,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMa
 
     // Try to collapse adjacent endifs.
     auto E = MBB.end();
-    auto Lead = skipDebugInstructionsForward(MBB.begin(), E);
+    auto Lead = MBB.getFirstTerminator();
     if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI))
       continue;
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/collapse-endcf.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/collapse-endcf.mir?rev=357634&r1=357633&r2=357634&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/collapse-endcf.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/collapse-endcf.mir Wed Apr  3 13:53:20 2019
@@ -49,8 +49,10 @@ body:             |
   ; GCN:   successors: %bb.4(0x80000000)
   ; GCN:   DBG_VALUE
   ; GCN: bb.4:
+  ; GCN:   successors: %bb.5(0x80000000)
   ; GCN:   DBG_VALUE
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.5:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
@@ -95,12 +97,14 @@ body:             |
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
     DBG_VALUE
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
 
   bb.4:
     DBG_VALUE
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.5:
     %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -121,7 +125,7 @@ machineFunctionInfo:
 body:             |
   ; GCN-LABEL: name: simple_nested_if_empty_block_between
   ; GCN: bb.0:
-  ; GCN:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GCN:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
   ; GCN:   liveins: $vgpr0, $sgpr0_sgpr1
   ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
   ; GCN:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -129,7 +133,7 @@ body:             |
   ; GCN:   [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
   ; GCN:   [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_LT_U32_e64_]], implicit-def dead $scc
   ; GCN:   $exec = S_MOV_B64_term [[S_AND_B64_]]
-  ; GCN:   SI_MASK_BRANCH %bb.5, implicit $exec
+  ; GCN:   SI_MASK_BRANCH %bb.4, implicit $exec
   ; GCN:   S_BRANCH %bb.1
   ; GCN: bb.1:
   ; GCN:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -158,7 +162,9 @@ body:             |
   ; GCN: bb.4:
   ; GCN:   successors: %bb.5(0x80000000)
   ; GCN: bb.5:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.6:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
@@ -203,12 +209,14 @@ body:             |
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
+
+  bb.4:
 
   bb.5:
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
 
-  bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+  bb.6:
     %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -229,7 +237,7 @@ machineFunctionInfo:
 body:             |
   ; GCN-LABEL: name: simple_nested_if_empty_block_dbg_between
   ; GCN: bb.0:
-  ; GCN:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GCN:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
   ; GCN:   liveins: $vgpr0, $sgpr0_sgpr1
   ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
   ; GCN:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -267,7 +275,9 @@ body:             |
   ; GCN:   successors: %bb.5(0x80000000)
   ; GCN:   DBG_VALUE
   ; GCN: bb.5:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.6:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
@@ -283,7 +293,7 @@ body:             |
     %3:sreg_64 = COPY $exec, implicit-def $exec
     %4:sreg_64 = S_AND_B64 %3, %2, implicit-def dead $scc
     $exec = S_MOV_B64_term %4
-    SI_MASK_BRANCH %bb.4, implicit $exec
+    SI_MASK_BRANCH %bb.5, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -312,13 +322,15 @@ body:             |
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
 
-  bb.5:
+  bb.4:
     DBG_VALUE
 
-  bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+  bb.5:
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.6:
     %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -360,8 +372,7 @@ body:             |
   ; GCN:   %5.sub2:sgpr_128 = S_MOV_B32 0
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
-  ; GCN:   [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
-  ; GCN:   [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
+  ; GCN:   [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
   ; GCN:   $exec = S_MOV_B64_term [[S_AND_B64_1]]
   ; GCN:   SI_MASK_BRANCH %bb.3, implicit $exec
   ; GCN:   S_BRANCH %bb.2
@@ -376,9 +387,10 @@ body:             |
   ; GCN:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; GCN:   dead %16:sgpr_32 = S_BREV_B32 [[DEF]]
   ; GCN:   KILL [[DEF]]
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
   ; GCN: bb.4:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   successors: %bb.5(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.5:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
@@ -426,10 +438,12 @@ body:             |
     %15:sgpr_32 = IMPLICIT_DEF
     %16:sgpr_32 = S_BREV_B32 %15
     KILL %15
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
 
   bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.5:
     %17:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -475,7 +489,7 @@ body:             |
   ; GCN:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
   ; GCN:   [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
   ; GCN:   $exec = S_MOV_B64_term [[S_AND_B64_1]]
-  ; GCN:   SI_MASK_BRANCH %bb.3, implicit $exec
+  ; GCN:   SI_MASK_BRANCH %bb.4, implicit $exec
   ; GCN:   S_BRANCH %bb.2
   ; GCN: bb.2:
   ; GCN:   successors: %bb.3(0x80000000)
@@ -485,12 +499,16 @@ body:             |
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
   ; GCN:   successors: %bb.4(0x80000000)
+  ; GCN: bb.4:
+  ; GCN:   successors: %bb.5(0x80000000)
   ; GCN:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; GCN:   [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]]
   ; GCN:   KILL [[DEF]]
   ; GCN:   dead %17:sgpr_32 = COPY [[S_BREV_B32_]]
-  ; GCN: bb.4:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.5:
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.6:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
@@ -525,7 +543,7 @@ body:             |
     %12:sreg_64 = COPY $exec, implicit-def $exec
     %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc
     $exec = S_MOV_B64_term %13
-    SI_MASK_BRANCH %bb.3, implicit $exec
+    SI_MASK_BRANCH %bb.4, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
@@ -535,14 +553,18 @@ body:             |
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
+
+  bb.4:
     %15:sgpr_32 = IMPLICIT_DEF
     %16:sgpr_32 = S_BREV_B32 %15
     KILL %15
     %19:sgpr_32 = COPY %16
 
-  bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+  bb.5:
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.6:
     %17:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -598,10 +620,14 @@ body:             |
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
   ; GCN:   successors: %bb.4(0x80000000)
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
-  ; GCN:   dead %15:sreg_64 = S_BREV_B64 $exec
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY4]], implicit-def $scc
   ; GCN: bb.4:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   successors: %bb.5(0x80000000)
+  ; GCN:   dead %15:sreg_64 = S_BREV_B64 $exec
+  ; GCN: bb.5:
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.6:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
@@ -646,11 +672,15 @@ body:             |
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12,  implicit-def $scc
-    %15:sreg_64 = S_BREV_B64 $exec
+    $exec = S_OR_B64_term $exec, %12,  implicit-def $scc
 
   bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+    %15:sreg_64 = S_BREV_B64 $exec
+
+  bb.5:
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.6:
     %17:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -671,7 +701,7 @@ machineFunctionInfo:
 body:             |
   ; GCN-LABEL: name: copy_no_explicit_exec_dependency
   ; GCN: bb.0:
-  ; GCN:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
+  ; GCN:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
   ; GCN:   liveins: $vgpr0, $sgpr0_sgpr1
   ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
   ; GCN:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -679,7 +709,7 @@ body:             |
   ; GCN:   [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
   ; GCN:   [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_LT_U32_e64_]], implicit-def dead $scc
   ; GCN:   $exec = S_MOV_B64_term [[S_AND_B64_]]
-  ; GCN:   SI_MASK_BRANCH %bb.4, implicit $exec
+  ; GCN:   SI_MASK_BRANCH %bb.5, implicit $exec
   ; GCN:   S_BRANCH %bb.1
   ; GCN: bb.1:
   ; GCN:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -706,17 +736,21 @@ body:             |
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
   ; GCN:   successors: %bb.4(0x80000000)
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
-  ; GCN:   dead %15:vgpr_32 = COPY %5.sub2
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY4]], implicit-def $scc
   ; GCN: bb.4:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   successors: %bb.5(0x80000000)
+  ; GCN:   dead %15:vgpr_32 = COPY %5.sub2
+  ; GCN: bb.5:
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.6:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
   ; GCN:   DS_WRITE_B32 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
   ; GCN:   S_ENDPGM 0
   bb.0:
-    successors: %bb.1, %bb.4
+    successors: %bb.1, %bb.5
     liveins: $vgpr0, $sgpr0_sgpr1
 
     %1:sgpr_64 = COPY $sgpr0_sgpr1
@@ -725,7 +759,7 @@ body:             |
     %3:sreg_64 = COPY $exec, implicit-def $exec
     %4:sreg_64 = S_AND_B64 %3, %2, implicit-def dead $scc
     $exec = S_MOV_B64_term %4
-    SI_MASK_BRANCH %bb.4, implicit $exec
+    SI_MASK_BRANCH %bb.5, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -754,11 +788,15 @@ body:             |
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12,  implicit-def $scc
-    %15:vgpr_32 = COPY %5.sub2
+    $exec = S_OR_B64_term $exec, %12,  implicit-def $scc
 
   bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+    %15:vgpr_32 = COPY %5.sub2
+
+  bb.5:
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.6:
     %17:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -813,17 +851,19 @@ body:             |
   ; GCN:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
-  ; GCN:   successors: %bb.5(0x80000000)
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
-  ; GCN:   S_BRANCH %bb.5
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY4]], implicit-def $scc
+  ; GCN:   S_BRANCH %bb.6
   ; GCN: bb.4:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   successors: %bb.5(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.5:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
   ; GCN:   DS_WRITE_B32 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
   ; GCN:   S_ENDPGM 0
-  ; GCN: bb.5:
+  ; GCN: bb.6:
   ; GCN:   successors: %bb.4(0x80000000)
   ; GCN:   S_BRANCH %bb.4
   bb.0:
@@ -865,18 +905,20 @@ body:             |
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
-    S_BRANCH %bb.5
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
+    S_BRANCH %bb.6
 
   bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.5:
     %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
     DS_WRITE_B32 %16, %15, 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
     S_ENDPGM 0
 
-  bb.5:
+  bb.6:
     S_BRANCH %bb.4
 
 ...

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll?rev=357634&r1=357633&r2=357634&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll Wed Apr  3 13:53:20 2019
@@ -15,13 +15,14 @@ define amdgpu_cs float @ds_ordered_swap(
 }
 
 ; FUNC-LABEL: {{^}}ds_ordered_swap_conditional:
-; GCN: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: v_cmp_ne_u32_e32 vcc, 0, v1
 ; GCN: s_and_saveexec_b64 s[[SAVED:\[[0-9]+:[0-9]+\]]], vcc
 ; // We have to use s_cbranch, because ds_ordered_count has side effects with EXEC=0
 ; GCN: s_cbranch_execz [[BB:BB._.]]
 ; GCN: s_mov_b32 m0, s0
 ; VIGFX9-NEXT: s_nop 0
-; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v0 offset:4868 gds
+; GCN-NEXT: ds_ordered_count v0, v1 offset:4868 gds
 ; GCN-NEXT: [[BB]]:
 ; // Wait for expcnt(0) before modifying EXEC
 ; GCN-NEXT: s_waitcnt expcnt(0)
