[llvm] 6742261 - [AMDGPU] Apply pre-emit s_cbranch_vcc optimization to more patterns
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 14 19:03:05 PDT 2020
Author: Carl Ritson
Date: 2020-07-15T11:02:35+09:00
New Revision: 674226126da6f08d97d383fca3b0c0e8c758d053
URL: https://github.com/llvm/llvm-project/commit/674226126da6f08d97d383fca3b0c0e8c758d053
DIFF: https://github.com/llvm/llvm-project/commit/674226126da6f08d97d383fca3b0c0e8c758d053.diff
LOG: [AMDGPU] Apply pre-emit s_cbranch_vcc optimization to more patterns
Add handling of s_andn2 and a mask of 0.
This eliminates redundant instructions from uniform control flow.
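For example (visible in the sgpr-control-flow.ll changes below), an s_andn2
against a mask of 0 computes vcc = exec & ~0 = exec, so the branch can test
exec directly:

  s_mov_b64 s[8:9], 0
  s_andn2_b64 vcc, exec, s[8:9]
  s_cbranch_vccz BB0_3
  =>
  s_cbranch_execz BB0_3

while a mask of -1 gives vcc = exec & ~(-1) = 0, so an s_cbranch_vccnz can
never be taken and the whole sequence is deleted.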
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D83641
Added:
Modified:
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
llvm/test/CodeGen/AMDGPU/infinite-loop.ll
llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
llvm/test/CodeGen/AMDGPU/wave32.ll
llvm/test/CodeGen/AMDGPU/wqm.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 1bb66907f9ce..f31c722db1b2 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -54,14 +54,14 @@ char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
// Match:
- // sreg = -1
- // vcc = S_AND_B64 exec, sreg
+ // sreg = -1 or 0
+ // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
// S_CBRANCH_VCC[N]Z
// =>
// S_CBRANCH_EXEC[N]Z
// We end up with this pattern sometimes after basic block placement.
- // It happens while combining a block which assigns -1 to a saved mask and
- // another block which consumes that saved mask and then a branch.
+ // It happens while combining a block which assigns -1 or 0 to a saved mask
+ // and another block which consumes that saved mask and then a branch.
bool Changed = false;
MachineBasicBlock &MBB = *MI.getParent();
const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
@@ -69,6 +69,7 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
const unsigned CondReg = TRI->getVCC();
const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
E = MBB.rend();
@@ -80,7 +81,8 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
if (A->modifiesRegister(ExecReg, TRI))
return false;
if (A->modifiesRegister(CondReg, TRI)) {
- if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
+ if (!A->definesRegister(CondReg, TRI) ||
+ (A->getOpcode() != And && A->getOpcode() != AndN2))
return false;
break;
}
@@ -97,9 +99,10 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
}
if (Op1.getReg() != ExecReg)
return Changed;
- if (Op2.isImm() && Op2.getImm() != -1)
+ if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
return Changed;
+ int64_t MaskValue = 0;
Register SReg;
if (Op2.isReg()) {
SReg = Op2.getReg();
@@ -113,28 +116,75 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
ReadsSreg |= M->readsRegister(SReg, TRI);
}
if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
- M->getOperand(1).getImm() != -1)
+ (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
return Changed;
- // First if sreg is only used in and instruction fold the immediate
- // into that and.
+ MaskValue = M->getOperand(1).getImm();
+ // First, if sreg is only used in the AND instruction, fold the immediate
+ // into the AND.
if (!ReadsSreg && Op2.isKill()) {
- A->getOperand(2).ChangeToImmediate(-1);
+ A->getOperand(2).ChangeToImmediate(MaskValue);
M->eraseFromParent();
}
+ } else if (Op2.isImm()) {
+ MaskValue = Op2.getImm();
+ } else {
+ llvm_unreachable("Op2 must be register or immediate");
}
+ // Invert mask for s_andn2
+ assert(MaskValue == 0 || MaskValue == -1);
+ if (A->getOpcode() == AndN2)
+ MaskValue = ~MaskValue;
+
if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
MI.killsRegister(CondReg, TRI))
A->eraseFromParent();
bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
if (SReg == ExecReg) {
+ // EXEC is updated directly
if (IsVCCZ) {
MI.eraseFromParent();
return true;
}
MI.setDesc(TII->get(AMDGPU::S_BRANCH));
- } else {
+ } else if (IsVCCZ && MaskValue == 0) {
+ // Will always branch
+ // Remove all successors shadowed by the new unconditional branch
+ MachineBasicBlock *Parent = MI.getParent();
+ SmallVector<MachineInstr *, 4> ToRemove;
+ bool Found = false;
+ for (MachineInstr &Term : Parent->terminators()) {
+ if (Found) {
+ if (Term.isBranch())
+ ToRemove.push_back(&Term);
+ } else {
+ Found = Term.isIdenticalTo(MI);
+ }
+ }
+ assert(Found && "conditional branch is not terminator");
+ for (auto BranchMI : ToRemove) {
+ MachineOperand &Dst = BranchMI->getOperand(0);
+ assert(Dst.isMBB() && "destination is not basic block");
+ Parent->removeSuccessor(Dst.getMBB());
+ BranchMI->eraseFromParent();
+ }
+
+ if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
+ Parent->removeSuccessor(Succ);
+ }
+
+ // Rewrite to unconditional branch
+ MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+ } else if (!IsVCCZ && MaskValue == 0) {
+ // Will never branch
+ MachineOperand &Dst = MI.getOperand(0);
+ assert(Dst.isMBB() && "destination is not basic block");
+ MI.getParent()->removeSuccessor(Dst.getMBB());
+ MI.eraseFromParent();
+ return true;
+ } else if (MaskValue == -1) {
+ // Depends only on EXEC
MI.setDesc(
TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
}
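To summarize the mask algebra the pass now relies on (the S_ANDN2 mask is
first inverted into MaskValue, after which both opcodes are handled alike):

  opcode        mask   vcc                 rewrite
  S_AND_B64      -1    exec                S_CBRANCH_VCC[N]Z => S_CBRANCH_EXEC[N]Z
  S_AND_B64       0    0                   VCCZ => S_BRANCH, VCCNZ => erased
  S_ANDN2_B64    -1    exec & ~(-1) = 0    VCCZ => S_BRANCH, VCCNZ => erased
  S_ANDN2_B64     0    exec & ~0 = exec    S_CBRANCH_VCC[N]Z => S_CBRANCH_EXEC[N]Z

Wave32 targets use the B32 opcode forms and exec_lo in the same way.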
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 8c6b94da79cf..1125dbb75c56 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -482,13 +482,10 @@ ret:
; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
; GCN-NEXT: s_addc_u32
; GCN-NEXT: s_setpc_b64
-
; GCN-NEXT: [[LONG_BR_0]]:
-; GCN: s_setpc_b64
-; GCN: [[LONG_BR_DEST0]]
+; GCN: [[LONG_BR_DEST0]]:
-; GCN: s_cbranch_vccnz
; GCN-DAG: v_cmp_lt_i32
; GCN-DAG: v_cmp_ge_i32
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index b108e2637536..bca00f69e25c 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -524,7 +524,7 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(
; GCN: {{^; %bb.[0-9]}}:
; GCN: s_mov_b64 exec,
-; GCN: s_cbranch_vccnz [[BB2]]
+; GCN: s_cbranch_execnz [[BB2]]
define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index b2acc37493e4..6d63ca5332e7 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -159,7 +159,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: s_and_b64 vcc, exec, 0
-; SI-NEXT: s_cbranch_vccz BB3_2
+; SI-NEXT: s_branch BB3_2
; SI-NEXT: BB3_5: ; %UnifiedReturnBlock
; SI-NEXT: s_endpgm
; IR-LABEL: @infinite_loop_nest_ret(
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir b/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
index 3011da138c76..ecfd59dfbcd0 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
@@ -338,3 +338,80 @@ body: |
S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
S_ENDPGM 0
...
+---
+# GCN-LABEL: name: andn2_execz_mov_vccz
+# GCN-NOT: S_MOV_
+# GCN-NOT: S_ANDN2_
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: andn2_execz_mov_vccz
+body: |
+ bb.0:
+ S_NOP 0
+
+ bb.1:
+ S_NOP 0
+
+ bb.2:
+ $sgpr0_sgpr1 = S_MOV_B64 0
+ $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+ S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+ S_ENDPGM 0
+...
+---
+# GCN-LABEL: name: andn2_branch_mov_vccz
+# GCN-NOT: S_MOV_
+# GCN-NOT: S_ANDN2_
+# GCN: S_BRANCH %bb.1
+name: andn2_branch_mov_vccz
+body: |
+ bb.0:
+ S_NOP 0
+
+ bb.1:
+ S_NOP 0
+
+ bb.2:
+ $sgpr0_sgpr1 = S_MOV_B64 -1
+ $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+ S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+ S_ENDPGM 0
+...
+---
+# GCN-LABEL: name: andn2_execnz_mov_vccnz
+# GCN-NOT: S_MOV_
+# GCN-NOT: S_ANDN2_
+# GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+name: andn2_execnz_mov_vccnz
+body: |
+ bb.0:
+ S_NOP 0
+
+ bb.1:
+ S_NOP 0
+
+ bb.2:
+ $sgpr0_sgpr1 = S_MOV_B64 0
+ $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+ S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+ S_ENDPGM 0
+...
+---
+# GCN-LABEL: name: andn2_no_branch_mov_vccnz
+# GCN-NOT: S_MOV_
+# GCN-NOT: S_ANDN2_
+# GCN-NOT: S_CBRANCH
+# GCN-NOT: S_BRANCH
+name: andn2_no_branch_mov_vccnz
+body: |
+ bb.0:
+ S_NOP 0
+
+ bb.1:
+ S_NOP 0
+
+ bb.2:
+ $sgpr0_sgpr1 = S_MOV_B64 -1
+ $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+ S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index c53f2b07aa7c..15643d4b67f7 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1327,9 +1327,6 @@ define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 add
; SI-NEXT: s_cbranch_vccz BB26_3
; SI-NEXT: s_branch BB26_4
; SI-NEXT: BB26_2:
-; SI-NEXT: s_mov_b64 s[2:3], -1
-; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; SI-NEXT: s_cbranch_vccnz BB26_4
; SI-NEXT: BB26_3: ; %if
; SI-NEXT: s_load_dword s1, s[6:7], 0x0
; SI-NEXT: BB26_4: ; %endif
@@ -1350,14 +1347,9 @@ define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 add
; VI-NEXT: s_cbranch_scc0 BB26_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_load_dword s1, s[6:7], 0x4
-; VI-NEXT: s_mov_b64 s[2:3], 0
-; VI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; VI-NEXT: s_cbranch_vccz BB26_3
+; VI-NEXT: s_cbranch_execz BB26_3
; VI-NEXT: s_branch BB26_4
; VI-NEXT: BB26_2:
-; VI-NEXT: s_mov_b64 s[2:3], -1
-; VI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; VI-NEXT: s_cbranch_vccnz BB26_4
; VI-NEXT: BB26_3: ; %if
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s1, s[6:7], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 144b3f2599bf..147d406a14f1 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -367,7 +367,6 @@ exit1: ; preds = %LeafBlock, %LeafBlock1
; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
; GCN: {{^}}[[FLOW]]:
-; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
; GCN: s_or_b64 exec, exec
; GCN: v_mov_b32_e32 v0, 2.0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index a72af066a9c9..d040a04877e6 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -19,15 +19,10 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a,
; SI-NEXT: s_cbranch_scc0 BB0_2
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_add_i32 s2, s7, s2
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; SI-NEXT: s_cbranch_vccz BB0_3
+; SI-NEXT: s_cbranch_execz BB0_3
; SI-NEXT: s_branch BB0_4
; SI-NEXT: BB0_2:
-; SI-NEXT: s_mov_b64 s[8:9], -1
; SI-NEXT: ; implicit-def: $sgpr2
-; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; SI-NEXT: s_cbranch_vccnz BB0_4
; SI-NEXT: BB0_3: ; %if
; SI-NEXT: s_sub_i32 s2, s5, s6
; SI-NEXT: BB0_4: ; %endif
@@ -69,15 +64,10 @@ define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x
; SI-NEXT: s_load_dword s6, s[0:1], 0x37
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_i32 s3, s3, s6
-; SI-NEXT: s_mov_b64 s[6:7], 0
-; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; SI-NEXT: s_cbranch_vccz BB1_3
+; SI-NEXT: s_cbranch_execz BB1_3
; SI-NEXT: s_branch BB1_4
; SI-NEXT: BB1_2:
-; SI-NEXT: s_mov_b64 s[6:7], -1
; SI-NEXT: ; implicit-def: $sgpr3
-; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; SI-NEXT: s_cbranch_vccnz BB1_4
; SI-NEXT: BB1_3: ; %if
; SI-NEXT: s_load_dword s3, s[0:1], 0x1c
; SI-NEXT: s_load_dword s0, s[0:1], 0x25
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 0b0b9a30f113..55557e51b82c 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -668,7 +668,7 @@ define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d
; GCN-LABEL: {{^}}test_loop_vcc:
; GFX1032: v_cmp_lt_f32_e32 vcc_lo,
; GFX1064: v_cmp_lt_f32_e32 vcc,
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_vccz
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
entry:
br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 167d8fa21ccb..127d0bc0fc68 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -652,13 +652,11 @@ main_body:
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
-; CHECK: ; %body
+; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
-; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %loop
+; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
-; CHECK: s_cbranch_vccz
-
-; CHECK: s_cbranch_vccnz [[LOOPHDR]]
+; CHECK: s_cbranch_vccz [[LOOPHDR]]
; CHECK: ; %break
; CHECK: ; return