[llvm] [CodeGenPrepare] Unfold slow ctpop when used in power-of-two test (PR #102731)
Sergei Barannikov via llvm-commits
llvm-commits at lists.llvm.org
Sat Aug 10 00:36:59 PDT 2024
https://github.com/s-barannikov created https://github.com/llvm/llvm-project/pull/102731
The DAG combiner already performs this transformation, but in some cases
it does not get the chance, because either CodeGenPrepare or
SelectionDAGBuilder has already moved the icmp into a different basic block.
Fixes #94829
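
For reference, the rewrite being decanonicalized here is the standard
power-of-two test; a minimal IR sketch (hypothetical function name, showing
the shape before and after the new hook fires on a target with slow ctpop):

define i1 @pow2_or_zero(i64 %x) {
  %ctpop = call i64 @llvm.ctpop.i64(i64 %x)
  %cmp = icmp ult i64 %ctpop, 2
  ret i1 %cmp
}

; becomes:

define i1 @pow2_or_zero(i64 %x) {
  %sub = add i64 %x, -1          ; x - 1
  %and = and i64 %x, %sub        ; clears the lowest set bit of x
  %cmp = icmp eq i64 %and, 0     ; zero iff x had at most one bit set
  ret i1 %cmp
}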
From 1106bffe79184d0f2ac4ee9223dd0a032aaa84c9 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <barannikov88 at gmail.com>
Date: Sat, 10 Aug 2024 09:53:14 +0300
Subject: [PATCH 1/2] Precommit test
---
llvm/test/CodeGen/RISCV/pr101786.ll | 60 ++++++++++++
llvm/test/CodeGen/X86/pr94829.ll | 49 ++++++++++
.../CodeGenPrepare/unfold-pow2-test.ll | 97 +++++++++++++++++++
3 files changed, 206 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/pr101786.ll
create mode 100644 llvm/test/CodeGen/X86/pr94829.ll
create mode 100644 llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
diff --git a/llvm/test/CodeGen/RISCV/pr101786.ll b/llvm/test/CodeGen/RISCV/pr101786.ll
new file mode 100644
index 00000000000000..afac9e18da1eea
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr101786.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=riscv64 -o - %s | FileCheck %s
+
+define i64 @test(i64 %x, ptr %p) {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: bgtz a2, .LBB0_3
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: srli a3, a2, 1
+; CHECK-NEXT: lui a4, 349525
+; CHECK-NEXT: addiw a4, a4, 1365
+; CHECK-NEXT: slli a5, a4, 32
+; CHECK-NEXT: add a4, a4, a5
+; CHECK-NEXT: and a3, a3, a4
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: lui a3, 209715
+; CHECK-NEXT: addiw a3, a3, 819
+; CHECK-NEXT: slli a4, a3, 32
+; CHECK-NEXT: add a3, a3, a4
+; CHECK-NEXT: and a4, a2, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: srli a3, a2, 4
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: lui a3, 61681
+; CHECK-NEXT: addiw a3, a3, -241
+; CHECK-NEXT: slli a4, a3, 32
+; CHECK-NEXT: add a3, a3, a4
+; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: slli a3, a2, 8
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: slli a3, a2, 16
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: slli a3, a2, 32
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: srli a2, a2, 56
+; CHECK-NEXT: li a3, 1
+; CHECK-NEXT: bltu a3, a2, .LBB0_3
+; CHECK-NEXT: # %bb.2: # %if.else
+; CHECK-NEXT: ld a0, 0(a1)
+; CHECK-NEXT: .LBB0_3: # %if.end
+; CHECK-NEXT: ret
+entry:
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %x)
+ %cmp1 = icmp ugt i64 %ctpop, 1
+ %cmp2 = icmp sgt i64 %x, 0
+ %or = or i1 %cmp2, %cmp1
+ br i1 %or, label %if.end, label %if.else
+
+if.else:
+ %load = load i64, ptr %p, align 8
+ br label %if.end
+
+if.end:
+ %res = phi i64 [0, %entry], [%load, %if.else]
+ ret i64 %res
+}
diff --git a/llvm/test/CodeGen/X86/pr94829.ll b/llvm/test/CodeGen/X86/pr94829.ll
new file mode 100644
index 00000000000000..3fc5db79074101
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr94829.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=x86_64 -o - %s | FileCheck %s
+
+define ptr @test(i64 %x) {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: subq %rcx, %rdi
+; CHECK-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; CHECK-NEXT: movq %rdi, %rcx
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: shrq $2, %rdi
+; CHECK-NEXT: andq %rax, %rdi
+; CHECK-NEXT: addq %rcx, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: shrq $4, %rax
+; CHECK-NEXT: addq %rdi, %rax
+; CHECK-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; CHECK-NEXT: imulq %rcx, %rax
+; CHECK-NEXT: shrq $56, %rax
+; CHECK-NEXT: cmpq $2, %rax
+; CHECK-NEXT: jb .LBB0_2
+; CHECK-NEXT: # %bb.1: # %if.else
+; CHECK-NEXT: cmpl $2, %eax
+; CHECK-NEXT: .LBB0_2: # %exit1
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+entry:
+ %ctpop = tail call i64 @llvm.ctpop.i64(i64 %x)
+ %cmp = icmp ult i64 %ctpop, 2
+ br i1 %cmp, label %exit1, label %if.else
+
+if.else:
+ br i1 %cmp, label %exit2, label %exit3
+
+exit1:
+ ret ptr null
+
+exit2:
+ ret ptr null
+
+exit3:
+ ret ptr null
+}
diff --git a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
new file mode 100644
index 00000000000000..dd9f51cec95240
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p 'require<profile-summary>,function(codegenprepare)' -S %s \
+; RUN: | FileCheck %s --check-prefix=SLOW
+; RUN: opt -p 'require<profile-summary>,function(codegenprepare)' -S --mattr=+zbb %s \
+; RUN: | FileCheck %s --check-prefix=FAST
+; REQUIRES: riscv64-registered-target
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64"
+
+define i64 @test_ult_2(i64 %x, i64 %y, i64 %a, i64 %b) {
+; SLOW-LABEL: define i64 @test_ult_2(
+; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; SLOW-NEXT: [[ENTRY:.*]]:
+; SLOW-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
+; SLOW-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
+; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
+; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
+; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; SLOW: [[IF_THEN]]:
+; SLOW-NEXT: br label %[[IF_END]]
+; SLOW: [[IF_END]]:
+; SLOW-NEXT: [[RES:%.*]] = phi i64 [ [[A]], %[[IF_THEN]] ], [ [[B]], %[[ENTRY]] ]
+; SLOW-NEXT: ret i64 [[RES]]
+;
+; FAST-LABEL: define i64 @test_ult_2(
+; FAST-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; FAST-NEXT: [[ENTRY:.*]]:
+; FAST-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
+; FAST-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
+; FAST-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
+; FAST-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
+; FAST-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; FAST: [[IF_THEN]]:
+; FAST-NEXT: br label %[[IF_END]]
+; FAST: [[IF_END]]:
+; FAST-NEXT: [[RES:%.*]] = phi i64 [ [[A]], %[[IF_THEN]] ], [ [[B]], %[[ENTRY]] ]
+; FAST-NEXT: ret i64 [[RES]]
+;
+entry:
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %x)
+ %cmp1 = icmp ugt i64 %ctpop, 1
+ %cmp2 = icmp sgt i64 %y, 0
+ %cmp = or i1 %cmp2, %cmp1
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ br label %if.end
+
+if.end:
+ %res = phi i64 [ %a, %if.then ], [ %b, %entry ]
+ ret i64 %res
+}
+
+define i64 @test_ugt_1(i64 %x, i64 %y, i64 %a, i64 %b) {
+; SLOW-LABEL: define i64 @test_ugt_1(
+; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; SLOW-NEXT: [[ENTRY:.*]]:
+; SLOW-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
+; SLOW-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
+; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
+; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
+; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; SLOW: [[IF_THEN]]:
+; SLOW-NEXT: br label %[[IF_END]]
+; SLOW: [[IF_END]]:
+; SLOW-NEXT: [[RES:%.*]] = phi i64 [ [[A]], %[[IF_THEN]] ], [ [[B]], %[[ENTRY]] ]
+; SLOW-NEXT: ret i64 [[RES]]
+;
+; FAST-LABEL: define i64 @test_ugt_1(
+; FAST-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
+; FAST-NEXT: [[ENTRY:.*]]:
+; FAST-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
+; FAST-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
+; FAST-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
+; FAST-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
+; FAST-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; FAST: [[IF_THEN]]:
+; FAST-NEXT: br label %[[IF_END]]
+; FAST: [[IF_END]]:
+; FAST-NEXT: [[RES:%.*]] = phi i64 [ [[A]], %[[IF_THEN]] ], [ [[B]], %[[ENTRY]] ]
+; FAST-NEXT: ret i64 [[RES]]
+;
+entry:
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %x)
+ %cmp1 = icmp ugt i64 %ctpop, 1
+ %cmp2 = icmp sgt i64 %y, 0
+ %cmp = or i1 %cmp2, %cmp1
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ br label %if.end
+
+if.end:
+ %res = phi i64 [ %a, %if.then ], [ %b, %entry ]
+ ret i64 %res
+}
From 8446e05fa61fc268eacccfefda65c7a08da2dfe7 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <barannikov88 at gmail.com>
Date: Sat, 10 Aug 2024 10:30:27 +0300
Subject: [PATCH 2/2] [CodeGenPrepare] Unfold slow ctpop when used in
power-of-two test
The DAG combiner already performs this transformation, but in some cases
it does not get the chance, because either CodeGenPrepare or
SelectionDAGBuilder has already moved the icmp into a different basic block.
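
The two predicates the new hook handles map onto the classic
clear-lowest-bit test (my own annotation of the rewrite rules below, not
part of the patch):

  (ctpop x) u< 2  ->  (x & (x - 1)) == 0   ; x is zero or a power of two
  (ctpop x) u> 1  ->  (x & (x - 1)) != 0   ; x has at least two bits set

Subtracting 1 borrows through the lowest set bit of x, so x & (x - 1)
clears exactly that bit; the result is zero iff at most one bit was set.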
---
llvm/lib/CodeGen/CodeGenPrepare.cpp | 38 +++++++++++++++++++
llvm/test/CodeGen/RISCV/pr101786.ll | 32 +---------------
llvm/test/CodeGen/RISCV/rv32zbb.ll | 24 ++++++------
llvm/test/CodeGen/X86/pr94829.ll | 26 +------------
.../CodeGenPrepare/unfold-pow2-test.ll | 10 +++--
5 files changed, 60 insertions(+), 70 deletions(-)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 22d0708f547860..ca6afc69919a89 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -473,6 +473,7 @@ class CodeGenPrepare {
bool optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ bool unfoldPow2Test(CmpInst *Cmp);
void verifyBFIUpdates(Function &F);
bool _run(Function &F);
};
@@ -1757,6 +1758,40 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
return true;
}
+// Decanonicalizes icmp+ctpop power-of-two test if ctpop is slow.
+bool CodeGenPrepare::unfoldPow2Test(CmpInst *Cmp) {
+ ICmpInst::Predicate Pred;
+ Value *X;
+ uint64_t C;
+
+ if (!match(Cmp, m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(X)),
+ m_ConstantInt(C))))
+ return false;
+
+ Type *Ty = X->getType();
+ if (Ty->isVectorTy() || TTI->getPopcntSupport(Ty->getIntegerBitWidth()) ==
+ TargetTransformInfo::PSK_FastHardware)
+ return false;
+
+ // (ctpop x) u< 2 -> (x & (x - 1)) == 0
+ // (ctpop x) u> 1 -> (x & (x - 1)) != 0
+ if ((Pred == CmpInst::ICMP_ULT && C == 2) ||
+ (Pred == CmpInst::ICMP_UGT && C == 1)) {
+ IRBuilder<> Builder(Cmp);
+ Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(Ty));
+ Value *And = Builder.CreateAnd(X, Sub);
+ CmpInst::Predicate NewPred =
+ Pred == CmpInst::ICMP_ULT ? CmpInst::ICMP_EQ : CmpInst::ICMP_NE;
+ Value *NewCmp =
+ Builder.CreateICmp(NewPred, And, ConstantInt::getNullValue(Ty));
+ Cmp->replaceAllUsesWith(NewCmp);
+ RecursivelyDeleteTriviallyDeadInstructions(Cmp);
+ return true;
+ }
+
+ return false;
+}
+
/// Sink the given CmpInst into user blocks to reduce the number of virtual
/// registers that must be created and coalesced. This is a clear win except on
/// targets with multiple condition code registers (PowerPC), where it might
@@ -1984,6 +2019,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (combineToUSubWithOverflow(Cmp, ModifiedDT))
return true;
+ if (unfoldPow2Test(Cmp))
+ return true;
+
if (foldICmpWithDominatingICmp(Cmp, *TLI))
return true;
diff --git a/llvm/test/CodeGen/RISCV/pr101786.ll b/llvm/test/CodeGen/RISCV/pr101786.ll
index afac9e18da1eea..6d0736edd3e898 100644
--- a/llvm/test/CodeGen/RISCV/pr101786.ll
+++ b/llvm/test/CodeGen/RISCV/pr101786.ll
@@ -8,37 +8,9 @@ define i64 @test(i64 %x, ptr %p) {
; CHECK-NEXT: li a0, 0
; CHECK-NEXT: bgtz a2, .LBB0_3
; CHECK-NEXT: # %bb.1: # %entry
-; CHECK-NEXT: srli a3, a2, 1
-; CHECK-NEXT: lui a4, 349525
-; CHECK-NEXT: addiw a4, a4, 1365
-; CHECK-NEXT: slli a5, a4, 32
-; CHECK-NEXT: add a4, a4, a5
-; CHECK-NEXT: and a3, a3, a4
-; CHECK-NEXT: sub a2, a2, a3
-; CHECK-NEXT: lui a3, 209715
-; CHECK-NEXT: addiw a3, a3, 819
-; CHECK-NEXT: slli a4, a3, 32
-; CHECK-NEXT: add a3, a3, a4
-; CHECK-NEXT: and a4, a2, a3
-; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: addi a3, a2, -1
; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: add a2, a4, a2
-; CHECK-NEXT: srli a3, a2, 4
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: lui a3, 61681
-; CHECK-NEXT: addiw a3, a3, -241
-; CHECK-NEXT: slli a4, a3, 32
-; CHECK-NEXT: add a3, a3, a4
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: slli a3, a2, 8
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: slli a3, a2, 16
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: slli a3, a2, 32
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: srli a2, a2, 56
-; CHECK-NEXT: li a3, 1
-; CHECK-NEXT: bltu a3, a2, .LBB0_3
+; CHECK-NEXT: bnez a2, .LBB0_3
; CHECK-NEXT: # %bb.2: # %if.else
; CHECK-NEXT: ld a0, 0(a1)
; CHECK-NEXT: .LBB0_3: # %if.end
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index fa320f53cec6ce..888bee577d106c 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -571,12 +571,12 @@ define i64 @ctpop_i64(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_two:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi a2, a0, -1
-; RV32I-NEXT: and a2, a0, a2
-; RV32I-NEXT: seqz a0, a0
-; RV32I-NEXT: sub a0, a1, a0
-; RV32I-NEXT: and a0, a1, a0
-; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: seqz a2, a0
+; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: addi a3, a0, -1
+; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: seqz a0, a0
; RV32I-NEXT: ret
;
@@ -595,12 +595,12 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_one:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi a2, a0, -1
-; RV32I-NEXT: and a2, a0, a2
-; RV32I-NEXT: seqz a0, a0
-; RV32I-NEXT: sub a0, a1, a0
-; RV32I-NEXT: and a0, a1, a0
-; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: seqz a2, a0
+; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: addi a3, a0, -1
+; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/X86/pr94829.ll b/llvm/test/CodeGen/X86/pr94829.ll
index 3fc5db79074101..8d078b2f8e4b9a 100644
--- a/llvm/test/CodeGen/X86/pr94829.ll
+++ b/llvm/test/CodeGen/X86/pr94829.ll
@@ -4,30 +4,8 @@
define ptr @test(i64 %x) {
; CHECK-LABEL: test:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: subq %rcx, %rdi
-; CHECK-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; CHECK-NEXT: movq %rdi, %rcx
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: shrq $2, %rdi
-; CHECK-NEXT: andq %rax, %rdi
-; CHECK-NEXT: addq %rcx, %rdi
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: addq %rdi, %rax
-; CHECK-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; CHECK-NEXT: imulq %rcx, %rax
-; CHECK-NEXT: shrq $56, %rax
-; CHECK-NEXT: cmpq $2, %rax
-; CHECK-NEXT: jb .LBB0_2
-; CHECK-NEXT: # %bb.1: # %if.else
-; CHECK-NEXT: cmpl $2, %eax
-; CHECK-NEXT: .LBB0_2: # %exit1
+; CHECK-NEXT: leaq -1(%rdi), %rax
+; CHECK-NEXT: testq %rax, %rdi
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
entry:
diff --git a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
index dd9f51cec95240..716bfdfe194119 100644
--- a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
@@ -12,8 +12,9 @@ define i64 @test_ult_2(i64 %x, i64 %y, i64 %a, i64 %b) {
; SLOW-LABEL: define i64 @test_ult_2(
; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
; SLOW-NEXT: [[ENTRY:.*]]:
-; SLOW-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
-; SLOW-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
+; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
+; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]]
+; SLOW-NEXT: [[CMP1:%.*]] = icmp ne i64 [[TMP1]], 0
; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
@@ -56,8 +57,9 @@ define i64 @test_ugt_1(i64 %x, i64 %y, i64 %a, i64 %b) {
; SLOW-LABEL: define i64 @test_ugt_1(
; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
; SLOW-NEXT: [[ENTRY:.*]]:
-; SLOW-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
-; SLOW-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
+; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
+; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]]
+; SLOW-NEXT: [[CMP1:%.*]] = icmp ne i64 [[TMP1]], 0
; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]