[llvm] bc110de - [SelectionDAG] don't split branch on logic-of-vector-compares

Thu Jul 2 14:05:33 PDT 2020

Author: Sanjay Patel
Date: 2020-07-02T17:05:24-04:00
New Revision: bc110de78a4bf47f63267eae07ef02f14bcc78e3

URL: https://github.com/llvm/llvm-project/commit/bc110de78a4bf47f63267eae07ef02f14bcc78e3
DIFF: https://github.com/llvm/llvm-project/commit/bc110de78a4bf47f63267eae07ef02f14bcc78e3.diff

LOG: [SelectionDAG] don't split branch on logic-of-vector-compares

SelectionDAGBuilder converts logic-of-compares into multiple branches based
on a boolean TLI setting in isJumpExpensive(). But that heuristic probably
never considered the pattern of bools extracted from a vector compare - it
seems unlikely that we would want to turn vector logic into control flow.

The motivating x86 reduction case is shown in PR44565:
https://bugs.llvm.org/show_bug.cgi?id=44565
...and that test shows the expected improvement from using movmskpd codegen.

For AArch64, I modified the test to include an extra op because the simpler
test gets transformed by a codegen invocation of SimplifyCFG.

Differential Revision: https://reviews.llvm.org/D82602

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/test/CodeGen/AArch64/vec-extract-branch.ll
    llvm/test/CodeGen/X86/setcc-logic.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c7f1780770dc..1645a1f136bc 100644

--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2303,7 +2303,9 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) {
 
   // If this is a series of conditions that are or'd or and'd together, emit
   // this as a sequence of branches instead of setcc's with and/or operations.
-  // As long as jumps are not expensive, this should improve performance.
+  // As long as jumps are not expensive (exceptions for multi-use logic ops,
+  // unpredictable branches, and vector extracts because those jumps are likely
+  // expensive for any target), this should improve performance.
   // For example, instead of something like:
   //     cmp A, B
   //     C = seteq
@@ -2318,9 +2320,12 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) {
   //     jle foo
   if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) {
     Instruction::BinaryOps Opcode = BOp->getOpcode();
+    Value *Vec, *BOp0 = BOp->getOperand(0), *BOp1 = BOp->getOperand(1);
     if (!DAG.getTargetLoweringInfo().isJumpExpensive() && BOp->hasOneUse() &&
         !I.hasMetadata(LLVMContext::MD_unpredictable) &&
-        (Opcode == Instruction::And || Opcode == Instruction::Or)) {
+        (Opcode == Instruction::And || Opcode == Instruction::Or) &&
+        !(match(BOp0, m_ExtractElt(m_Value(Vec), m_Value())) &&
+          match(BOp1, m_ExtractElt(m_Specific(Vec), m_Value())))) {
       FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB,
                            Opcode,
                            getEdgeProbability(BrMBB, Succ0MBB),

diff  --git a/llvm/test/CodeGen/AArch64/vec-extract-branch.ll b/llvm/test/CodeGen/AArch64/vec-extract-branch.ll
index 22f6c86a07fb..e05213d53421 100644
--- a/llvm/test/CodeGen/AArch64/vec-extract-branch.ll
+++ b/llvm/test/CodeGen/AArch64/vec-extract-branch.ll
@@ -6,16 +6,15 @@ define i32 @vec_extract_branch(<2 x double> %x, i32 %y)  {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcmgt v0.2d, v0.2d, #0.0
 ; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    tbz w8, #0, .LBB0_3
-; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    tbz w8, #0, .LBB0_3
-; CHECK-NEXT:  // %bb.2: // %true
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    and w8, w9, w8
+; CHECK-NEXT:    tbz w8, #0, .LBB0_2
+; CHECK-NEXT:  // %bb.1: // %true
 ; CHECK-NEXT:    mov w8, #42
 ; CHECK-NEXT:    sdiv w0, w8, w0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_3: // %false
+; CHECK-NEXT:  .LBB0_2: // %false
 ; CHECK-NEXT:    mov w0, #88
 ; CHECK-NEXT:    ret
   %t1 = fcmp ogt <2 x double> %x, zeroinitializer

diff  --git a/llvm/test/CodeGen/X86/setcc-logic.ll b/llvm/test/CodeGen/X86/setcc-logic.ll
index a878a33d270c..59e0efc18c87 100644
--- a/llvm/test/CodeGen/X86/setcc-logic.ll
+++ b/llvm/test/CodeGen/X86/setcc-logic.ll
@@ -323,15 +323,12 @@ define i32 @vec_extract_branch(<2 x double> %x)  {
 ; CHECK-NEXT:    xorpd %xmm1, %xmm1
 ; CHECK-NEXT:    cmpltpd %xmm0, %xmm1
 ; CHECK-NEXT:    movmskpd %xmm1, %eax
-; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB16_3
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    shrb %al
-; CHECK-NEXT:    je .LBB16_3
-; CHECK-NEXT:  # %bb.2: # %true
+; CHECK-NEXT:    cmpb $3, %al
+; CHECK-NEXT:    jne .LBB16_2
+; CHECK-NEXT:  # %bb.1: # %true
 ; CHECK-NEXT:    movl $42, %eax
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB16_3: # %false
+; CHECK-NEXT:  .LBB16_2: # %false
 ; CHECK-NEXT:    movl $88, %eax
 ; CHECK-NEXT:    retq
   %t1 = fcmp ogt <2 x double> %x, zeroinitializer