[llvm] 12a3243 - [AMDGPU] Limit endcf-collapse to simple if

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 7 10:27:33 PDT 2020


Author: Stanislav Mekhanoshin
Date: 2020-04-07T10:27:23-07:00
New Revision: 12a324393d61a0ef602feeb9705ccb5b61563c27

URL: https://github.com/llvm/llvm-project/commit/12a324393d61a0ef602feeb9705ccb5b61563c27
DIFF: https://github.com/llvm/llvm-project/commit/12a324393d61a0ef602feeb9705ccb5b61563c27.diff

LOG: [AMDGPU] Limit endcf-collapse to simple if

We can only collapse adjacent SI_END_CF instructions if the outer
statement belongs to a simple SI_IF; otherwise the correct mask is
not in the register we expect, but is an argument of an S_XOR
instruction.
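
As a rough sketch (register names here are illustrative, not taken
from the patch), the two forms of the SI_IF lowering in
SILowerControlFlow.cpp look roughly like this; compare the GCN check
lines in the new MIR test below:

    ; Simple SI_IF: the mask SI_END_CF must restore is a plain copy of exec.
    %saved:sreg_64  = COPY $exec, implicit-def $exec
    %masked:sreg_64 = S_AND_B64 %saved, %cond, implicit-def dead $scc
    $exec = S_MOV_B64_term killed %masked
    S_CBRANCH_EXECZ %bb.endif, implicit $exec

    ; Non-simple SI_IF: the mask SI_END_CF must restore is the S_XOR result,
    ; not the plain copy of exec, so the collapse cannot assume a simple copy.
    %copy:sreg_64   = COPY $exec, implicit-def $exec
    %masked:sreg_64 = S_AND_B64 %copy, %cond, implicit-def dead $scc
    %saved:sreg_64  = S_XOR_B64 %masked, %copy, implicit-def dead $scc
    $exec = S_MOV_B64_term killed %masked
    S_CBRANCH_EXECZ %bb.endif, implicit $exec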

Even if an SI_IF is simple, it might be lowered using S_XOR because
the lowering depends on the basic block layout. An SI_IF is not
considered simple if the instruction consuming its output is not an
SI_END_CF, and since that SI_END_CF might have already been lowered
to an S_OR, the isSimpleIf() check may return false.

This situation is an opportunity to further optimize the SI_IF
lowering, but that is a separate change. In the meantime, move the
SI_END_CF collapsing to after the lowering, when we already know how
the rest of the CFG was lowered, since the non-simple SI_IF case
still needs to be handled.
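
Schematically, after lowering, the pattern optimizeEndCf() looks for
is two back-to-back exec-mask restores (possibly across a trivial
successor block), where the inner one can be dropped as long as the
outer mask comes from a lowered simple SI_IF rather than an SI_ELSE
(register names are again illustrative):

    ; Inner SI_END_CF, redundant: the outer restore below already re-enables
    ; every lane the inner one would.
    $exec = S_OR_B64 $exec, %inner_saved, implicit-def $scc
    ; Outer SI_END_CF: %outer_saved is a plain copy of the pre-if exec mask,
    ; which is what the LoweredIf set tracks.
    $exec = S_OR_B64 $exec, %outer_saved, implicit-def $scc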

Differential Revision: https://reviews.llvm.org/D77610

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
    llvm/test/CodeGen/AMDGPU/collapse-endcf.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index caf21086cc37..1e90e6ba5418 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -51,6 +51,7 @@
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
@@ -86,7 +87,7 @@ class SILowerControlFlow : public MachineFunctionPass {
   const SIInstrInfo *TII = nullptr;
   LiveIntervals *LIS = nullptr;
   MachineRegisterInfo *MRI = nullptr;
-  DenseSet<const MachineInstr*> LoweredEndCf;
+  SetVector<MachineInstr*> LoweredEndCf;
   DenseSet<Register> LoweredIf;
 
   const TargetRegisterClass *BoolRC = nullptr;
@@ -117,6 +118,9 @@ class SILowerControlFlow : public MachineFunctionPass {
   skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator It) const;
 
+  // Remove redundant SI_END_CF instructions.
+  void optimizeEndCf();
+
 public:
   static char ID;
 
@@ -448,29 +452,6 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
   MachineInstr *Def = MRI.getUniqueVRegDef(CFMask);
   const DebugLoc &DL = MI.getDebugLoc();
 
-  // If the only instruction immediately following this END_CF is an another
-  // END_CF in the only successor we can avoid emitting exec mask restore here.
-  if (RemoveRedundantEndcf) {
-    auto Next =
-      skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI.getIterator()));
-    if (Next != MBB.end() && (Next->getOpcode() == AMDGPU::SI_END_CF ||
-                              LoweredEndCf.count(&*Next))) {
-      // Only skip inner END_CF if outer ENDCF belongs to SI_IF.
-      // If that belongs to SI_ELSE then saved mask has an inverted value.
-      Register SavedExec = Next->getOperand(0).getReg();
-      const MachineInstr *Def = MRI.getUniqueVRegDef(SavedExec);
-      // A lowered SI_IF turns definition into COPY of exec.
-      if (Def && (Def->getOpcode() == AMDGPU::SI_IF ||
-                  LoweredIf.count(SavedExec))) {
-        LLVM_DEBUG(dbgs() << "Skip redundant "; MI.dump());
-        if (LIS)
-          LIS->RemoveMachineInstrFromMaps(MI);
-        MI.eraseFromParent();
-        return;
-      }
-    }
-  }
-
   MachineBasicBlock::iterator InsPt =
       Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def))
                                : MBB.begin();
@@ -544,6 +525,34 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
     MRI->getUniqueVRegDef(Reg)->eraseFromParent();
 }
 
+void SILowerControlFlow::optimizeEndCf() {
+  // If the only instruction immediately following an END_CF is another
+  // END_CF in the only successor, we can avoid emitting the exec mask restore.
+  if (!RemoveRedundantEndcf)
+    return;
+
+  for (MachineInstr *MI : LoweredEndCf) {
+    MachineBasicBlock &MBB = *MI->getParent();
+    auto Next =
+      skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
+    if (Next == MBB.end() || !LoweredEndCf.count(&*Next))
+      continue;
+    // Only skip inner END_CF if outer ENDCF belongs to SI_IF.
+    // If that belongs to SI_ELSE then saved mask has an inverted value.
+    Register SavedExec
+      = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg();
+    assert(SavedExec.isVirtual() && "Expected saved exec to be src1!");
+
+    const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec);
+    if (Def && LoweredIf.count(SavedExec)) {
+      LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump());
+      if (LIS)
+        LIS->RemoveMachineInstrFromMaps(*MI);
+      MI->eraseFromParent();
+    }
+  }
+}
+
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -626,6 +635,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
+  optimizeEndCf();
+
   LoweredEndCf.clear();
   LoweredIf.clear();
 

diff  --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
index 8bb1d134154d..815251e3560c 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
@@ -533,3 +533,63 @@ body:             |
     S_ENDPGM 0
 
 ...
+
+---
+name: if_inside_loop
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GCN-LABEL: name: if_inside_loop
+  ; GCN: bb.0:
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   S_BRANCH %bb.6
+  ; GCN: bb.1:
+  ; GCN:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; GCN:   [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
+  ; GCN:   [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc
+  ; GCN:   $exec = S_MOV_B64_term killed [[S_AND_B64_]]
+  ; GCN:   S_CBRANCH_EXECZ %bb.3, implicit $exec
+  ; GCN: bb.2:
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   S_BRANCH %bb.6
+  ; GCN: bb.3:
+  ; GCN:   successors: %bb.4(0x80000000)
+  ; GCN:   $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc
+  ; GCN: bb.4:
+  ; GCN:   successors: %bb.5(0x80000000)
+  ; GCN:   $exec = S_OR_B64 $exec, %2, implicit-def $scc
+  ; GCN: bb.5:
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN: bb.6:
+  ; GCN:   successors: %bb.4(0x40000000), %bb.0(0x40000000)
+  ; GCN:   [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
+  ; GCN:   [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc
+  ; GCN:   [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_1]], [[COPY1]], implicit-def dead $scc
+  ; GCN:   $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
+  ; GCN:   S_CBRANCH_EXECZ %bb.4, implicit $exec
+  ; GCN:   S_BRANCH %bb.0
+  ; GCN:   S_ENDPGM 0
+  bb.0:
+    S_BRANCH %bb.6
+
+  bb.1:
+    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.2:
+    S_BRANCH %bb.6
+
+  bb.3:
+    SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.4:
+    SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.5:
+
+  bb.6:
+    %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.0
+    S_ENDPGM 0
+
+...


        


More information about the llvm-commits mailing list