[llvm] 50b8634 - [X86] Improve optimizeCompareInstr for signed comparisons after BMI/TBM instructions

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 31 09:46:01 PDT 2021


Author: Craig Topper
Date: 2021-03-31T09:45:29-07:00
New Revision: 50b8634a99b6f2f36a3fdbea7aa7892c9b881d64

URL: https://github.com/llvm/llvm-project/commit/50b8634a99b6f2f36a3fdbea7aa7892c9b881d64
DIFF: https://github.com/llvm/llvm-project/commit/50b8634a99b6f2f36a3fdbea7aa7892c9b881d64.diff

LOG: [X86] Improve optimizeCompareInstr for signed comparisons after BMI/TBM instructions

We previously couldn't optimize out a TEST if the branch/setcc/cmov
used the overflow flag. This patch allows the TEST to be removed if
the flag-producing instruction is known to clear the OF flag, which
is what the TEST instruction would have done, so the two are
equivalent.
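
For example, the blsi32_sle test in the diff below does a signed
compare of a BLSI result. Sketching the IR from the visible pieces of
that test (the middle instructions are reconstructed):

  define i32 @blsi32_sle(i32 %a, i32 %b, i32 %c) nounwind {
    %t0 = sub i32 0, %a                 ; -a
    %t1 = and i32 %t0, %a               ; a & -a, selected as BLSI
    %t2 = icmp sle i32 %t1, 0           ; signed compare reads SF, OF, ZF
    %t3 = select i1 %t2, i32 %b, i32 %c
    ret i32 %t3
  }

BLSI sets ZF and SF from its result and clears OF, exactly what a
TEST against zero would produce for those three flags, so the
testl/testq can be dropped. (CF differs, which is why the unsigned
condition codes below still bail out.)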

More test cases still need to be added. I'll try to get back to this
when I have bandwidth.

Fixes PR48768.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D94856

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86InstrInfo.cpp
    llvm/test/CodeGen/X86/bmi.ll
    llvm/test/CodeGen/X86/tbm_patterns.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 9c2970c324a3..5d34912b7c0a 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3972,8 +3972,10 @@ inline static bool isRedundantFlagInstr(const MachineInstr &FlagI,
 
 /// Check whether the definition can be converted
 /// to remove a comparison against zero.
-inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) {
+inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
+                                    bool &ClearsOverflowFlag) {
   NoSignFlag = false;
+  ClearsOverflowFlag = false;
 
   switch (MI.getOpcode()) {
   default: return false;
@@ -4039,12 +4041,6 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) {
   case X86::SHL8r1:    case X86::SHL16r1:  case X86::SHL32r1:case X86::SHL64r1:
   case X86::ANDN32rr:  case X86::ANDN32rm:
   case X86::ANDN64rr:  case X86::ANDN64rm:
-  case X86::BLSI32rr:  case X86::BLSI32rm:
-  case X86::BLSI64rr:  case X86::BLSI64rm:
-  case X86::BLSMSK32rr:case X86::BLSMSK32rm:
-  case X86::BLSMSK64rr:case X86::BLSMSK64rm:
-  case X86::BLSR32rr:  case X86::BLSR32rm:
-  case X86::BLSR64rr:  case X86::BLSR64rm:
   case X86::BZHI32rr:  case X86::BZHI32rm:
   case X86::BZHI64rr:  case X86::BZHI64rm:
   case X86::LZCNT16rr: case X86::LZCNT16rm:
@@ -4056,6 +4052,13 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) {
   case X86::TZCNT16rr: case X86::TZCNT16rm:
   case X86::TZCNT32rr: case X86::TZCNT32rm:
   case X86::TZCNT64rr: case X86::TZCNT64rm:
+    return true;
+  case X86::BLSI32rr:    case X86::BLSI32rm:
+  case X86::BLSI64rr:    case X86::BLSI64rm:
+  case X86::BLSMSK32rr:  case X86::BLSMSK32rm:
+  case X86::BLSMSK64rr:  case X86::BLSMSK64rm:
+  case X86::BLSR32rr:    case X86::BLSR32rm:
+  case X86::BLSR64rr:    case X86::BLSR64rm:
   case X86::BLCFILL32rr: case X86::BLCFILL32rm:
   case X86::BLCFILL64rr: case X86::BLCFILL64rm:
   case X86::BLCI32rr:    case X86::BLCI32rm:
@@ -4074,12 +4077,17 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) {
   case X86::T1MSKC64rr:  case X86::T1MSKC64rm:
   case X86::TZMSK32rr:   case X86::TZMSK32rm:
   case X86::TZMSK64rr:   case X86::TZMSK64rm:
+    // These instructions clear the overflow flag just like TEST.
+    // FIXME: These are not the only instructions in this switch that clear the
+    // overflow flag.
+    ClearsOverflowFlag = true;
     return true;
   case X86::BEXTR32rr:   case X86::BEXTR64rr:
   case X86::BEXTR32rm:   case X86::BEXTR64rm:
   case X86::BEXTRI32ri:  case X86::BEXTRI32mi:
   case X86::BEXTRI64ri:  case X86::BEXTRI64mi:
-    // BEXTR doesn't update the sign flag so we can't use it.
+    // BEXTR doesn't update the sign flag so we can't use it. It does clear
+    // the overflow flag, but that's not useful without the sign flag.
     NoSignFlag = true;
     return true;
   }
@@ -4199,8 +4207,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
   // right way.
   bool ShouldUpdateCC = false;
   bool NoSignFlag = false;
+  bool ClearsOverflowFlag = false;
   X86::CondCode NewCC = X86::COND_INVALID;
-  if (IsCmpZero && !isDefConvertible(*MI, NoSignFlag)) {
+  if (IsCmpZero && !isDefConvertible(*MI, NoSignFlag, ClearsOverflowFlag)) {
     // Scan forward from the use until we hit the use we're looking for or the
     // compare instruction.
     for (MachineBasicBlock::iterator J = MI;; ++J) {
@@ -4312,11 +4321,15 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
       default: break;
       case X86::COND_A: case X86::COND_AE:
       case X86::COND_B: case X86::COND_BE:
+        // CF is used, we can't perform this optimization.
+        return false;
       case X86::COND_G: case X86::COND_GE:
       case X86::COND_L: case X86::COND_LE:
       case X86::COND_O: case X86::COND_NO:
-        // CF and OF are used, we can't perform this optimization.
-        return false;
+        // If OF is used, the instruction needs to clear it like CmpZero does.
+        if (!ClearsOverflowFlag)
+          return false;
+        break;
       case X86::COND_S: case X86::COND_NS:
         // If SF is used, but the instruction doesn't update the SF, then we
         // can't do the optimization.

diff --git a/llvm/test/CodeGen/X86/bmi.ll b/llvm/test/CodeGen/X86/bmi.ll
index 641b03ea92f6..1522d27dcec9 100644
--- a/llvm/test/CodeGen/X86/bmi.ll
+++ b/llvm/test/CodeGen/X86/bmi.ll
@@ -539,11 +539,12 @@ define i32 @blsi32_z2(i32 %a, i32 %b, i32 %c) nounwind {
   ret i32 %t3
 }
 
+; Inspired by PR48768, but using cmovcc instead of setcc. There should be
+; no test instruction.
 define i32 @blsi32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; X86-LABEL: blsi32_sle:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsil {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmovlel %eax, %ecx
@@ -554,7 +555,6 @@ define i32 @blsi32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    blsil %edi, %ecx
-; X64-NEXT:    testl %ecx, %ecx
 ; X64-NEXT:    cmovgl %edx, %eax
 ; X64-NEXT:    retq
   %t0 = sub i32 0, %a
@@ -685,7 +685,6 @@ define i64 @blsi64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    blsiq %rdi, %rcx
-; X64-NEXT:    testq %rcx, %rcx
 ; X64-NEXT:    cmovgq %rdx, %rax
 ; X64-NEXT:    retq
   %t0 = sub i64 0, %a
@@ -776,7 +775,6 @@ define i32 @blsmsk32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; X86-LABEL: blsmsk32_sle:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsmskl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmovlel %eax, %ecx
@@ -787,7 +785,6 @@ define i32 @blsmsk32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    blsmskl %edi, %ecx
-; X64-NEXT:    testl %ecx, %ecx
 ; X64-NEXT:    cmovgl %edx, %eax
 ; X64-NEXT:    retq
   %t0 = sub i32 %a, 1
@@ -918,7 +915,6 @@ define i64 @blsmsk64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    blsmskq %rdi, %rcx
-; X64-NEXT:    testq %rcx, %rcx
 ; X64-NEXT:    cmovgq %rdx, %rax
 ; X64-NEXT:    retq
   %t0 = sub i64 %a, 1
@@ -1009,7 +1005,6 @@ define i32 @blsr32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; X86-LABEL: blsr32_sle:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsrl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmovlel %eax, %ecx
@@ -1020,7 +1015,6 @@ define i32 @blsr32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    blsrl %edi, %ecx
-; X64-NEXT:    testl %ecx, %ecx
 ; X64-NEXT:    cmovgl %edx, %eax
 ; X64-NEXT:    retq
   %t0 = sub i32 %a, 1
@@ -1151,7 +1145,6 @@ define i64 @blsr64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    blsrq %rdi, %rcx
-; X64-NEXT:    testq %rcx, %rcx
 ; X64-NEXT:    cmovgq %rdx, %rax
 ; X64-NEXT:    retq
   %t0 = sub i64 %a, 1

diff --git a/llvm/test/CodeGen/X86/tbm_patterns.ll b/llvm/test/CodeGen/X86/tbm_patterns.ll
index 5f5306a722b1..30e9a2638385 100644
--- a/llvm/test/CodeGen/X86/tbm_patterns.ll
+++ b/llvm/test/CodeGen/X86/tbm_patterns.ll
@@ -193,7 +193,6 @@ define i32 @test_x86_tbm_blcfill_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    blcfilll %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    cmovgl %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = add i32 %a, 1
@@ -245,7 +244,6 @@ define i64 @test_x86_tbm_blcfill_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    blcfillq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
 ; CHECK-NEXT:    cmovgq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = add i64 %a, 1
@@ -300,7 +298,6 @@ define i32 @test_x86_tbm_blci_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    blcil %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    cmovgl %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = add i32 1, %a
@@ -356,7 +353,6 @@ define i64 @test_x86_tbm_blci_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    blciq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
 ; CHECK-NEXT:    cmovgq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = add i64 1, %a
@@ -432,7 +428,6 @@ define i32 @test_x86_tbm_blcic_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    blcicl %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    cmovgl %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = xor i32 %a, -1
@@ -488,7 +483,6 @@ define i64 @test_x86_tbm_blcic_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    blcicq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
 ; CHECK-NEXT:    cmovgq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = xor i64 %a, -1
@@ -541,7 +535,6 @@ define i32 @test_x86_tbm_blcmsk_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    blcmskl %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    cmovgl %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = add i32 %a, 1
@@ -593,7 +586,6 @@ define i64 @test_x86_tbm_blcmsk_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    blcmskq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
 ; CHECK-NEXT:    cmovgq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = add i64 %a, 1
@@ -645,7 +637,6 @@ define i32 @test_x86_tbm_blcs_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    blcsl %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    cmovgl %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = add i32 %a, 1
@@ -697,7 +688,6 @@ define i64 @test_x86_tbm_blcs_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    blcsq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
 ; CHECK-NEXT:    cmovgq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = add i64 %a, 1
@@ -749,7 +739,6 @@ define i32 @test_x86_tbm_blsfill_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    blsfilll %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    cmovgl %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = add i32 %a, -1
@@ -801,7 +790,6 @@ define i64 @test_x86_tbm_blsfill_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    blsfillq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
 ; CHECK-NEXT:    cmovgq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = add i64 %a, -1
@@ -856,7 +844,6 @@ define i32 @test_x86_tbm_blsic_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    blsicl %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    cmovgl %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = xor i32 %a, -1
@@ -912,7 +899,6 @@ define i64 @test_x86_tbm_blsic_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    blsicq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
 ; CHECK-NEXT:    cmovgq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = xor i64 %a, -1
@@ -968,7 +954,6 @@ define i32 @test_x86_tbm_t1mskc_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    t1mskcl %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    cmovgl %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = xor i32 %a, -1
@@ -1024,7 +1009,6 @@ define i64 @test_x86_tbm_t1mskc_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    t1mskcq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
 ; CHECK-NEXT:    cmovgq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = xor i64 %a, -1
@@ -1080,7 +1064,6 @@ define i32 @test_x86_tbm_tzmsk_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    tzmskl %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    cmovgl %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = xor i32 %a, -1
@@ -1136,7 +1119,6 @@ define i64 @test_x86_tbm_tzmsk_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    tzmskq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
 ; CHECK-NEXT:    cmovgq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = xor i64 %a, -1
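
For contrast, BEXTR shows why isDefConvertible leaves
ClearsOverflowFlag unset even though the hardware clears OF: BEXTR
leaves SF undefined, and OF alone is useless to the signed condition
codes. A minimal sketch of IR that still keeps its TEST, assuming the
standard BMI intrinsic:

  declare i32 @llvm.x86.bmi.bextr.32(i32, i32)

  define i32 @bextr32_sle(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
    %t0 = call i32 @llvm.x86.bmi.bextr.32(i32 %a, i32 %b)
    %t1 = icmp sle i32 %t0, 0           ; COND_LE needs a valid SF
    %t2 = select i1 %t1, i32 %c, i32 %d
    ret i32 %t2
  }

Because NoSignFlag is set and ClearsOverflowFlag is not, the COND_LE
check in optimizeCompareInstr returns false and the testl of the
BEXTR result survives.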


        

