[llvm] 360aff0 - [AMDGPU] Simplify nested SI_END_CF

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 12 11:39:55 PDT 2020


Author: Stanislav Mekhanoshin
Date: 2020-03-12T11:25:07-07:00
New Revision: 360aff0493e6d089c592e5954024795ead0c69d1

URL: https://github.com/llvm/llvm-project/commit/360aff0493e6d089c592e5954024795ead0c69d1
DIFF: https://github.com/llvm/llvm-project/commit/360aff0493e6d089c592e5954024795ead0c69d1.diff

LOG: [AMDGPU] Simplify nested SI_END_CF

This is intended to replace the optimization in SIOptimizeExecMaskingPreRA.
We have fewer opportunities during control flow lowering because many
VGPR copies are still in place and will only be removed later, but here
we know for sure that an instruction is an SI_END_CF and not just an
arbitrary S_OR_B64 with EXEC.
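
As a sketch of the shape this catches (labels and registers are
illustrative, not taken from a particular test):

    s_and_saveexec_b64 s[0:1], vcc     ; outer if
    s_cbranch_execz BB0_3
    s_and_saveexec_b64 s[2:3], vcc     ; inner if
    s_cbranch_execz BB0_2
    ...
  BB0_2:
    s_or_b64 exec, exec, s[2:3]        ; inner SI_END_CF: now removed
  BB0_3:
    s_or_b64 exec, exec, s[0:1]        ; outer SI_END_CF: kept

Dropping the inner restore is safe because the inner saved mask is a
subset of the outer one, so the surviving s_or_b64 produces the same
exec either way.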

A subsequent change needs to convert s_and_saveexec into s_and and
address the new TODO lines in the tests; after that, the code block
guarded by the -amdgpu-remove-redundant-endcf option in the pre-RA exec
mask optimizer will be removed.
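
The TODO lines added below mark the spots that follow-up targets: once
the inner END_CF is gone, nothing reads the mask saved by the inner
s_and_saveexec_b64, so the save itself becomes dead. A rough sketch of
the planned rewrite (registers illustrative):

    s_and_saveexec_b64 s[2:3], vcc     ; emitted today; s[2:3] is unused
    s_and_b64 exec, exec, vcc          ; the intended follow-up form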

Differential Revision: https://reviews.llvm.org/D76033

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
    llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
    llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
    llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 06173551e92e..6e617df5f28f 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -51,6 +51,7 @@
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/LiveIntervals.h"
@@ -81,6 +82,7 @@ class SILowerControlFlow : public MachineFunctionPass {
   const SIInstrInfo *TII = nullptr;
   LiveIntervals *LIS = nullptr;
   MachineRegisterInfo *MRI = nullptr;
+  DenseSet<const MachineInstr*> LoweredEndCf;
 
   const TargetRegisterClass *BoolRC = nullptr;
   unsigned AndOpc;
@@ -103,6 +105,13 @@ class SILowerControlFlow : public MachineFunctionPass {
 
   void combineMasks(MachineInstr &MI);
 
+  // Skip to the next instruction, ignoring debug instructions and trivial
+  // block boundaries (blocks that have one (typically fallthrough) successor,
+  // and the successor has one predecessor).
+  MachineBasicBlock::iterator
+  skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator It) const;
+
 public:
   static char ID;
 
@@ -396,6 +405,36 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
+MachineBasicBlock::iterator
+SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
+  MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
+
+  SmallSet<const MachineBasicBlock *, 4> Visited;
+  MachineBasicBlock *B = &MBB;
+  do {
+    if (!Visited.insert(B).second)
+      return MBB.end();
+
+    auto E = B->end();
+    for ( ; It != E; ++It) {
+      if (TII->mayReadEXEC(*MRI, *It))
+        break;
+    }
+
+    if (It != E)
+      return It;
+
+    if (B->succ_size() != 1)
+      return MBB.end();
+
+    // If there is one trivial successor, advance to the next block.
+    MachineBasicBlock *Succ = *B->succ_begin();
+
+    It = Succ->begin();
+    B = Succ;
+  } while (true);
+}
+
 void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -403,6 +442,18 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
   MachineInstr *Def = MRI.getUniqueVRegDef(CFMask);
   const DebugLoc &DL = MI.getDebugLoc();
 
+  // If the only instruction immediately following this END_CF is another
+  // END_CF in the only successor, we can avoid emitting the exec mask restore.
+  auto Next = skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI.getIterator()));
+  if (Next != MBB.end() && (Next->getOpcode() == AMDGPU::SI_END_CF ||
+                            LoweredEndCf.count(&*Next))) {
+    LLVM_DEBUG(dbgs() << "Skip redundant "; MI.dump());
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(MI);
+    MI.eraseFromParent();
+    return;
+  }
+
   MachineBasicBlock::iterator InsPt =
       Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def))
                                : MBB.begin();
@@ -410,6 +461,8 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
                             .addReg(Exec)
                             .add(MI.getOperand(0));
 
+  LoweredEndCf.insert(NewMI);
+
   if (LIS)
     LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
 
@@ -556,5 +609,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
+  LoweredEndCf.clear();
+
   return true;
 }
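
Two details worth noting in the code above. First, blocks are not
necessarily visited in control-flow order, so the END_CF following the
one being lowered may already have been rewritten into its S_OR_B64
form; remembering the new instructions in LoweredEndCf lets the check
recognize both. Second, skipIgnoreExecInstsTrivialSucc lets the match
cross trivial block boundaries, as in the collapse-endcf.mir checks
below, where the removed restore is separated from the surviving one
only by exec-ignoring instructions and a fallthrough. Roughly (labels
and registers illustrative):

    BB0_2:                             ; inner END_CF: restore removed
      ; only exec-ignoring instructions here, e.g. debug values
    BB0_3:                             ; single fallthrough successor
      s_or_b64 exec, exec, s[0:1]      ; outer restore, still emitted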

diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index bc1c36484bca..45dbf39505b7 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -1,10 +1,12 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-remove-redundant-endcf < %s | FileCheck -enable-var-scope -check-prefixes=GCN,ALL %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-opt-exec-mask-pre-ra=0 < %s | FileCheck -enable-var-scope -check-prefixes=DISABLED,ALL %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
-; ALL-LABEL: {{^}}simple_nested_if:
+; GCN-LABEL: {{^}}simple_nested_if:
 ; GCN:      s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9_]+]]
-; GCN:      s_and_b64 exec, exec, vcc
+
+; TODO: this does not need to save exec, just perform the and.
+; GCN:      s_and_saveexec_b64 s[{{[0-9:]+}}], vcc
+
 ; GCN-NEXT: s_cbranch_execz [[ENDIF]]
 ; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN:      store_dword
@@ -13,9 +15,6 @@
 ; GCN: ds_write_b32
 ; GCN: s_endpgm
 
-
-; DISABLED: s_or_b64 exec, exec
-; DISABLED: s_or_b64 exec, exec
 define amdgpu_kernel void @simple_nested_if(i32 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -39,7 +38,7 @@ bb.outer.end:                                     ; preds = %bb.outer.then, %bb.
   ret void
 }
 
-; ALL-LABEL: {{^}}uncollapsable_nested_if:
+; GCN-LABEL: {{^}}uncollapsable_nested_if:
 ; GCN:      s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]]
 ; GCN:      s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
@@ -82,7 +81,7 @@ bb.outer.end:                                     ; preds = %bb.inner.then, %bb
   ret void
 }
 
-; ALL-LABEL: {{^}}nested_if_if_else:
+; GCN-LABEL: {{^}}nested_if_if_else:
 ; GCN:      s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]]
 ; GCN:      s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
@@ -128,7 +127,7 @@ bb.outer.end:                                        ; preds = %bb, %bb.then, %b
   ret void
 }
 
-; ALL-LABEL: {{^}}nested_if_else_if:
+; GCN-LABEL: {{^}}nested_if_else_if:
 ; GCN:      s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_xor_b64 [[SAVEEXEC_OUTER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_OUTER]]
 ; GCN-NEXT: s_cbranch_execz [[THEN_OUTER:BB[0-9_]+]]
@@ -151,9 +150,9 @@ bb.outer.end:                                        ; preds = %bb, %bb.then, %b
 ; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN:      store_dword
 ; GCN-NEXT: [[FLOW1]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_THEN]]
-; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]]
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]]
+; GCN-NOT:  s_or_b64 exec
+; GCN-NOT:  {{^.*:}}
 ; GCN: ds_write_b32
 ; GCN: s_endpgm
 define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) {
@@ -191,7 +190,7 @@ bb.outer.end:
   ret void
 }
 
-; ALL-LABEL: {{^}}s_endpgm_unsafe_barrier:
+; GCN-LABEL: {{^}}s_endpgm_unsafe_barrier:
 ; GCN:      s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9_]+]]
 ; GCN-NEXT: ; %bb.{{[0-9]+}}:
@@ -216,8 +215,7 @@ bb.end:                                           ; preds = %bb.then, %bb
   ret void
 }
 
-; Make sure scc liveness is updated if sor_b64 is removed
-; ALL-LABEL: {{^}}scc_liveness:
+; GCN-LABEL: {{^}}scc_liveness:
 
 ; GCN: %bb10
 ; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
@@ -229,7 +227,9 @@ bb.end:                                           ; preds = %bb.then, %bb
 ; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]
 
 ; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-; GCN: s_and_b64 exec, exec, {{vcc|s\[[0-9:]+\]}}
+
+; TODO: this does not need to save exec, just perform the and.
+; GCN:      s_and_saveexec_b64 s[{{[0-9:]+}}], {{vcc|s\[[0-9:]+\]}}
 
 ; GCN-NOT: s_or_b64 exec, exec
 

diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
index b9c93cc72468..a21ecdc6f975 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
@@ -46,7 +46,6 @@ body:             |
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
   ; GCN:   successors: %bb.4(0x80000000)
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
   ; GCN:   DBG_VALUE
   ; GCN: bb.4:
   ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
@@ -146,7 +145,6 @@ body:             |
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
   ; GCN:   successors: %bb.4(0x80000000)
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
   ; GCN: bb.4:
   ; GCN:   successors: %bb.5(0x80000000)
   ; GCN: bb.5:
@@ -246,7 +244,6 @@ body:             |
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
   ; GCN:   successors: %bb.4(0x80000000)
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
   ; GCN: bb.4:
   ; GCN:   successors: %bb.5(0x80000000)
   ; GCN:   DBG_VALUE
@@ -347,7 +344,6 @@ body:             |
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
   ; GCN:   successors: %bb.4(0x80000000)
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
   ; GCN:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; GCN:   [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]]
   ; GCN:   KILL [[DEF]]
@@ -450,7 +446,6 @@ body:             |
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
   ; GCN:   successors: %bb.4(0x80000000)
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
   ; GCN:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; GCN:   [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]]
   ; GCN:   KILL [[DEF]]
@@ -749,7 +744,6 @@ body:             |
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
   ; GCN:   successors: %bb.5(0x80000000)
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
   ; GCN:   S_BRANCH %bb.5
   ; GCN: bb.4:
   ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc

diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 9f717df480fb..5f0b5aaecac9 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -58,7 +58,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz BB1_4
+; GFX9-NEXT:    s_cbranch_execz BB1_3
 ; GFX9-NEXT:  ; %bb.1: ; %bb19
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, v6
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffffff, v6
@@ -100,9 +100,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[12:13]
 ; GFX9-NEXT:    s_cbranch_execnz BB1_2
-; GFX9-NEXT:  ; %bb.3: ; %Flow
-; GFX9-NEXT:    s_or_b64 exec, exec, s[12:13]
-; GFX9-NEXT:  BB1_4: ; %Flow3
+; GFX9-NEXT:  BB1_3: ; %Flow3
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]

More information about the llvm-commits mailing list