[llvm] 90081f3 - Revert "[Codegenprepare][X86] Use usub with overflow opt for IV increment"

Max Kazantsev via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 11 02:52:33 PST 2021


Author: Max Kazantsev
Date: 2021-02-11T17:52:11+07:00
New Revision: 90081f3020e38727eb30506d052cbb4e3a489eb6

URL: https://github.com/llvm/llvm-project/commit/90081f3020e38727eb30506d052cbb4e3a489eb6
DIFF: https://github.com/llvm/llvm-project/commit/90081f3020e38727eb30506d052cbb4e3a489eb6.diff

LOG: Revert "[Codegenprepare][X86] Use usub with overflow opt for IV increment"

This reverts commit 3d15b7e7dfc3e2cefc47791d1e8d95909e937842.

We've found an internal failure and need to analyze it.
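
For context: the reverted patch taught CodeGenPrepare to fold a decrementing IV's zero test and its decrement into a single @llvm.usub.with.overflow call whose overflow bit doubles as the loop-exit condition. A minimal sketch of the two shapes, with illustrative value names modeled on the usub_inc_iv.ll CHECK lines below:

    ; Canonical form (what the tests expect again after this revert):
    loop:
      %iv = phi i64 [ %iv.next, %backedge ], [ %len, %entry ]
      %cond = icmp eq i64 %iv, 0          ; exit test reads the un-decremented IV
      br i1 %cond, label %exit, label %backedge
    backedge:
      %iv.next = add i64 %iv, -1          ; decrement sits in the backedge
      br label %loop

    ; Fused form (what the reverted patch produced):
    loop:
      %iv = phi i64 [ %math, %backedge ], [ %len, %entry ]
      %t = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %iv, i64 1)
      %math = extractvalue { i64, i1 } %t, 0   ; %iv - 1
      %ov = extractvalue { i64, i1 } %t, 1     ; borrow, i.e. %iv was 0
      br i1 %ov, label %exit, label %backedge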

Added: 
    

Modified: 
    llvm/lib/CodeGen/CodeGenPrepare.cpp
    llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
    llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
    llvm/test/CodeGen/X86/usub_inc_iv.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index d62e1c119fe2..2b6dbde3ac58 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1284,29 +1284,7 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
                                                  Value *Arg0, Value *Arg1,
                                                  CmpInst *Cmp,
                                                  Intrinsic::ID IID) {
-  auto isIVIncrement = [this, &Cmp](BinaryOperator *BO) {
-    auto *PN = dyn_cast<PHINode>(BO->getOperand(0));
-    if (!PN)
-      return false;
-    const Loop *L = LI->getLoopFor(BO->getParent());
-    if (!L || L->getHeader() != PN->getParent() || !L->getLoopLatch())
-      return false;
-    if (PN->getIncomingValueForBlock(L->getLoopLatch()) != BO)
-      return false;
-    if (auto *Step = dyn_cast<Instruction>(BO->getOperand(1)))
-      if (L->contains(Step->getParent()))
-        return false;
-    // The IV increment may have users other than the IV. We do not want to
-    // make dominance queries to analyze the legality of moving it towards the
-    // cmp, so just check that there are no other users.
-    if (!BO->hasOneUse())
-      return false;
-    // Do not risk moving the increment into a child loop.
-    if (LI->getLoopFor(Cmp->getParent()) != L)
-      return false;
-    return true;
-  };
-  if (BO->getParent() != Cmp->getParent() && !isIVIncrement(BO)) {
+  if (BO->getParent() != Cmp->getParent()) {
     // We used to use a dominator tree here to allow multi-block optimization.
     // But that was problematic because:
     // 1. It could cause a perf regression by hoisting the math op into the
@@ -1317,16 +1295,9 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
     //    This is because we recompute the DT on every change in the main CGP
     //    run-loop. The recomputing is probably unnecessary in many cases, so if
     //    that was fixed, using a DT here would be ok.
-    //
-    // There is one important particular case we still want to handle: if BO is
-    // the IV increment. Important properties that make it profitable:
-    // - We can speculate the IV increment anywhere in the loop (as long as
-    //   the indvar Phi is its only user);
-    // - Upon computing Cmp, we effectively compute something equivalent to
-    //   the IV increment (despite it looking different in the IR). So moving
-    //   it up to the cmp point does not really increase register pressure.
     return false;
   }
+
   // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
   if (BO->getOpcode() == Instruction::Add &&
       IID == Intrinsic::usub_with_overflow) {

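Note on the CodeGenPrepare hunk above: the removed isIVIncrement heuristic permitted the math/cmp fusion across basic blocks in exactly one case, when BO is the loop's IV increment: BO's first operand is the header phi, BO is that phi's incoming value from the loop latch, the step is defined outside the loop, the phi is BO's sole user, and the cmp lives in the same loop as BO. With the revert, the fusion is again restricted to BO and Cmp sharing a basic block.
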
diff --git a/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll b/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
index c59f4a96464d..fc0d0ae36720 100644
--- a/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
+++ b/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
@@ -89,16 +89,15 @@ failure:                                          ; preds = %backedge
 define i32 @test_02(i32* %p, i64 %len, i32 %x) {
 ; CHECK-LABEL: test_02:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB2_1: ## %loop
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subq $1, %rax
-; CHECK-NEXT:    jb LBB2_4
+; CHECK-NEXT:    testq %rsi, %rsi
+; CHECK-NEXT:    je LBB2_4
 ; CHECK-NEXT:  ## %bb.2: ## %backedge
 ; CHECK-NEXT:    ## in Loop: Header=BB2_1 Depth=1
 ; CHECK-NEXT:    cmpl %edx, -4(%rdi,%rsi,4)
-; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    leaq -1(%rsi), %rsi
 ; CHECK-NEXT:    jne LBB2_1
 ; CHECK-NEXT:  ## %bb.3: ## %failure
 ; CHECK-NEXT:    ud2
@@ -133,16 +132,15 @@ failure:                                          ; preds = %backedge
 define i32 @test_03(i32* %p, i64 %len, i32 %x) {
 ; CHECK-LABEL: test_03:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB3_1: ## %loop
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subq $1, %rax
-; CHECK-NEXT:    jb LBB3_4
+; CHECK-NEXT:    testq %rsi, %rsi
+; CHECK-NEXT:    je LBB3_4
 ; CHECK-NEXT:  ## %bb.2: ## %backedge
 ; CHECK-NEXT:    ## in Loop: Header=BB3_1 Depth=1
 ; CHECK-NEXT:    cmpl %edx, -4(%rdi,%rsi,4)
-; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    leaq -1(%rsi), %rsi
 ; CHECK-NEXT:    jne LBB3_1
 ; CHECK-NEXT:  ## %bb.3: ## %failure
 ; CHECK-NEXT:    ud2

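The assembly deltas in this test show what the revert costs in a counting-down loop: with the intrinsic, a single subq $1, %rax both decrements the counter and sets CF on borrow, so jb alone serves as the exit branch; without it, the exit test is a separate testq/je pair and the decrement is done with a flag-preserving leaq -1(%rsi), %rsi. The same pattern recurs in the lsr-loop-exit-cond.ll diff below.
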
diff --git a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
index cb37086459dd..6472c62fc57e 100644
--- a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -16,11 +16,11 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; GENERIC-NEXT:    movl (%rdx), %eax
 ; GENERIC-NEXT:    movl 4(%rdx), %ebx
 ; GENERIC-NEXT:    decl %ecx
-; GENERIC-NEXT:    leaq 20(%rdx), %r11
+; GENERIC-NEXT:    leaq 20(%rdx), %r14
 ; GENERIC-NEXT:    movq _Te0@{{.*}}(%rip), %r9
 ; GENERIC-NEXT:    movq _Te1@{{.*}}(%rip), %r8
 ; GENERIC-NEXT:    movq _Te3@{{.*}}(%rip), %r10
-; GENERIC-NEXT:    movq %rcx, %r14
+; GENERIC-NEXT:    movq %rcx, %r11
 ; GENERIC-NEXT:    .p2align 4, 0x90
 ; GENERIC-NEXT:  LBB0_1: ## %bb
 ; GENERIC-NEXT:    ## =>This Inner Loop Header: Depth=1
@@ -32,29 +32,30 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; GENERIC-NEXT:    movzbl %bpl, %ebp
 ; GENERIC-NEXT:    movl (%r8,%rbp,4), %ebp
 ; GENERIC-NEXT:    xorl (%r9,%rax,4), %ebp
-; GENERIC-NEXT:    xorl -12(%r11), %ebp
+; GENERIC-NEXT:    xorl -12(%r14), %ebp
 ; GENERIC-NEXT:    shrl $24, %ebx
 ; GENERIC-NEXT:    movl (%r10,%rdi,4), %edi
 ; GENERIC-NEXT:    xorl (%r9,%rbx,4), %edi
-; GENERIC-NEXT:    xorl -8(%r11), %edi
+; GENERIC-NEXT:    xorl -8(%r14), %edi
 ; GENERIC-NEXT:    movl %ebp, %eax
 ; GENERIC-NEXT:    shrl $24, %eax
 ; GENERIC-NEXT:    movl (%r9,%rax,4), %eax
-; GENERIC-NEXT:    subq $1, %r14
-; GENERIC-NEXT:    jb LBB0_3
+; GENERIC-NEXT:    testq %r11, %r11
+; GENERIC-NEXT:    je LBB0_3
 ; GENERIC-NEXT:  ## %bb.2: ## %bb1
 ; GENERIC-NEXT:    ## in Loop: Header=BB0_1 Depth=1
 ; GENERIC-NEXT:    movl %edi, %ebx
 ; GENERIC-NEXT:    shrl $16, %ebx
 ; GENERIC-NEXT:    movzbl %bl, %ebx
 ; GENERIC-NEXT:    xorl (%r8,%rbx,4), %eax
-; GENERIC-NEXT:    xorl -4(%r11), %eax
+; GENERIC-NEXT:    xorl -4(%r14), %eax
 ; GENERIC-NEXT:    shrl $24, %edi
 ; GENERIC-NEXT:    movzbl %bpl, %ebx
 ; GENERIC-NEXT:    movl (%r10,%rbx,4), %ebx
 ; GENERIC-NEXT:    xorl (%r9,%rdi,4), %ebx
-; GENERIC-NEXT:    xorl (%r11), %ebx
-; GENERIC-NEXT:    addq $16, %r11
+; GENERIC-NEXT:    xorl (%r14), %ebx
+; GENERIC-NEXT:    decq %r11
+; GENERIC-NEXT:    addq $16, %r14
 ; GENERIC-NEXT:    jmp LBB0_1
 ; GENERIC-NEXT:  LBB0_3: ## %bb2
 ; GENERIC-NEXT:    shlq $4, %rcx
@@ -98,12 +99,12 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    ## kill: def $ecx killed $ecx def $rcx
 ; ATOM-NEXT:    movl (%rdx), %r15d
 ; ATOM-NEXT:    movl 4(%rdx), %eax
-; ATOM-NEXT:    leaq 20(%rdx), %r11
+; ATOM-NEXT:    leaq 20(%rdx), %r14
 ; ATOM-NEXT:    movq _Te0@{{.*}}(%rip), %r9
 ; ATOM-NEXT:    movq _Te1@{{.*}}(%rip), %r8
 ; ATOM-NEXT:    movq _Te3@{{.*}}(%rip), %r10
 ; ATOM-NEXT:    decl %ecx
-; ATOM-NEXT:    movq %rcx, %r14
+; ATOM-NEXT:    movq %rcx, %r11
 ; ATOM-NEXT:    .p2align 4, 0x90
 ; ATOM-NEXT:  LBB0_1: ## %bb
 ; ATOM-NEXT:    ## =>This Inner Loop Header: Depth=1
@@ -117,27 +118,28 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    movzbl %r15b, %edi
 ; ATOM-NEXT:    xorl (%r9,%rbp,4), %ebx
 ; ATOM-NEXT:    movl (%r10,%rdi,4), %edi
-; ATOM-NEXT:    xorl -12(%r11), %ebx
+; ATOM-NEXT:    xorl -12(%r14), %ebx
 ; ATOM-NEXT:    xorl (%r9,%rax,4), %edi
 ; ATOM-NEXT:    movl %ebx, %eax
-; ATOM-NEXT:    xorl -8(%r11), %edi
+; ATOM-NEXT:    xorl -8(%r14), %edi
 ; ATOM-NEXT:    shrl $24, %eax
 ; ATOM-NEXT:    movl (%r9,%rax,4), %r15d
-; ATOM-NEXT:    subq $1, %r14
+; ATOM-NEXT:    testq %r11, %r11
 ; ATOM-NEXT:    movl %edi, %eax
-; ATOM-NEXT:    jb LBB0_3
+; ATOM-NEXT:    je LBB0_3
 ; ATOM-NEXT:  ## %bb.2: ## %bb1
 ; ATOM-NEXT:    ## in Loop: Header=BB0_1 Depth=1
 ; ATOM-NEXT:    shrl $16, %eax
 ; ATOM-NEXT:    shrl $24, %edi
-; ATOM-NEXT:    movzbl %al, %eax
-; ATOM-NEXT:    xorl (%r8,%rax,4), %r15d
+; ATOM-NEXT:    decq %r11
+; ATOM-NEXT:    movzbl %al, %ebp
 ; ATOM-NEXT:    movzbl %bl, %eax
 ; ATOM-NEXT:    movl (%r10,%rax,4), %eax
-; ATOM-NEXT:    xorl -4(%r11), %r15d
+; ATOM-NEXT:    xorl (%r8,%rbp,4), %r15d
 ; ATOM-NEXT:    xorl (%r9,%rdi,4), %eax
-; ATOM-NEXT:    xorl (%r11), %eax
-; ATOM-NEXT:    addq $16, %r11
+; ATOM-NEXT:    xorl -4(%r14), %r15d
+; ATOM-NEXT:    xorl (%r14), %eax
+; ATOM-NEXT:    addq $16, %r14
 ; ATOM-NEXT:    jmp LBB0_1
 ; ATOM-NEXT:  LBB0_3: ## %bb2
 ; ATOM-NEXT:    shrl $16, %eax

diff --git a/llvm/test/CodeGen/X86/usub_inc_iv.ll b/llvm/test/CodeGen/X86/usub_inc_iv.ll
index 80f367a53479..7bf8b5e8b147 100644
--- a/llvm/test/CodeGen/X86/usub_inc_iv.ll
+++ b/llvm/test/CodeGen/X86/usub_inc_iv.ll
@@ -102,19 +102,18 @@ define i32 @test_02(i32* %p, i64 %len, i32 %x) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[MATH:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[IV]], i64 1)
-; CHECK-NEXT:    [[MATH]] = extractvalue { i64, i1 } [[TMP0]], 0
-; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
-; CHECK-NEXT:    br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[COND_1:%.*]] = icmp eq i64 [[IV]], 0
+; CHECK-NEXT:    br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]]
 ; CHECK:       backedge:
 ; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
-; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to i8*
+; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[SUNKADDR]]
 ; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
-; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
+; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
 ; CHECK-NEXT:    [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
 ; CHECK-NEXT:    br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i32 -1

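The usub_inc_iv.ll checks record the same revert at the IR level: the loop goes back to the canonical icmp eq i64 %iv, 0 exit test with an add i64 %iv, -1 decrement in the backedge, in place of the fused @llvm.usub.with.overflow.i64 call (compare the sketch near the top of this message).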