[llvm] [CodeGenPrepare] Unfold slow ctpop when used in power-of-two test (PR #102731)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 19 22:20:18 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-llvm-globalisel
Author: Sergei Barannikov (s-barannikov)
<details>
<summary>Changes</summary>
The DAG combiner already performs this transformation, but in some cases it does
not get the chance, because either CodeGenPrepare or SelectionDAGBuilder moves
the icmp into a different basic block.
https://alive2.llvm.org/ce/z/ARzh99
Fixes #94829
---
Patch is 80.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102731.diff
15 Files Affected:
- (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+59)
- (modified) llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll (+8-8)
- (modified) llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll (+41-164)
- (modified) llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll (+3-25)
- (added) llvm/test/CodeGen/RISCV/pr101786.ll (+32)
- (modified) llvm/test/CodeGen/RISCV/rv32zbb.ll (+48-48)
- (modified) llvm/test/CodeGen/RISCV/rv64zbb.ll (+24-24)
- (modified) llvm/test/CodeGen/X86/ispow2.ll (+22-22)
- (added) llvm/test/CodeGen/X86/pr94829.ll (+27)
- (modified) llvm/test/CodeGen/X86/vector-popcnt-128.ll (+16-18)
- (modified) llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll (+68-68)
- (modified) llvm/test/CodeGen/X86/vector-popcnt-256.ll (+143-145)
- (modified) llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll (+40-40)
- (modified) llvm/test/CodeGen/X86/vector-popcnt-512.ll (+56-56)
- (added) llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll (+99)
``````````diff
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 12a668507fe65..1c32f22234614 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -474,6 +474,7 @@ class CodeGenPrepare {
bool optimizeURem(Instruction *Rem);
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ bool unfoldPow2Test(CmpInst *Cmp);
void verifyBFIUpdates(Function &F);
bool _run(Function &F);
};
@@ -1762,6 +1763,61 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
return true;
}
+// Decanonicalizes icmp+ctpop power-of-two test if ctpop is slow.
+bool CodeGenPrepare::unfoldPow2Test(CmpInst *Cmp) {
+ CmpPredicate Pred;
+ Value *X;
+ const APInt *C;
+
+ // (icmp (ctpop x), c)
+ if (!match(Cmp, m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(X)),
+ m_APIntAllowPoison(C))))
+ return false;
+
+ // This transformation increases the number of instructions, don't do it if
+ // ctpop is fast.
+ Type *OpTy = X->getType();
+ if (TLI->isCtpopFast(TLI->getValueType(*DL, OpTy)))
+ return false;
+
+ // ctpop(x) u< 2 -> (x & (x - 1)) == 0
+ // ctpop(x) u> 1 -> (x & (x - 1)) != 0
+ // Also handles ctpop(x) == 1 and ctpop(x) != 1 if ctpop(x) is known non-zero.
+ if ((Pred == CmpInst::ICMP_ULT && *C == 2) ||
+ (Pred == CmpInst::ICMP_UGT && *C == 1) ||
+ (ICmpInst::isEquality(Pred) && *C == 1 &&
+ isKnownNonZero(Cmp->getOperand(0), *DL))) {
+ IRBuilder<> Builder(Cmp);
+ Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(OpTy));
+ Value *And = Builder.CreateAnd(X, Sub);
+ CmpInst::Predicate NewPred =
+ (Pred == CmpInst::ICMP_ULT || Pred == CmpInst::ICMP_EQ)
+ ? CmpInst::ICMP_EQ
+ : CmpInst::ICMP_NE;
+ Value *NewCmp =
+ Builder.CreateICmp(NewPred, And, ConstantInt::getNullValue(OpTy));
+ Cmp->replaceAllUsesWith(NewCmp);
+ RecursivelyDeleteTriviallyDeadInstructions(Cmp);
+ return true;
+ }
+
+ // (ctpop x) == 1 -> (x ^ (x - 1)) u> (x - 1)
+ // (ctpop x) != 1 -> (x ^ (x - 1)) u<= (x - 1)
+ if (ICmpInst::isEquality(Pred) && *C == 1) {
+ IRBuilder<> Builder(Cmp);
+ Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(OpTy));
+ Value *Xor = Builder.CreateXor(X, Sub);
+ CmpInst::Predicate NewPred =
+ Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT : CmpInst::ICMP_ULE;
+ Value *NewCmp = Builder.CreateICmp(NewPred, Xor, Sub);
+ Cmp->replaceAllUsesWith(NewCmp);
+ RecursivelyDeleteTriviallyDeadInstructions(Cmp);
+ return true;
+ }
+
+ return false;
+}
+
/// Sink the given CmpInst into user blocks to reduce the number of virtual
/// registers that must be created and coalesced. This is a clear win except on
/// targets with multiple condition code registers (PowerPC), where it might
@@ -2183,6 +2239,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (combineToUSubWithOverflow(Cmp, ModifiedDT))
return true;
+ if (unfoldPow2Test(Cmp))
+ return true;
+
if (foldICmpWithDominatingICmp(Cmp, *TLI))
return true;
diff --git a/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll
index ff7f1fc902981..04351346745b3 100644
--- a/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll
@@ -11945,11 +11945,11 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) {
; PWR5-LABEL: ugt_1_v2i64:
; PWR5: # %bb.0:
; PWR5-NEXT: addi 5, 3, -1
+; PWR5-NEXT: addi 6, 4, -1
; PWR5-NEXT: and 3, 3, 5
-; PWR5-NEXT: addi 5, 4, -1
+; PWR5-NEXT: and 4, 4, 6
; PWR5-NEXT: subfic 3, 3, 0
; PWR5-NEXT: subfe 3, 3, 3
-; PWR5-NEXT: and 4, 4, 5
; PWR5-NEXT: subfic 4, 4, 0
; PWR5-NEXT: subfe 4, 4, 4
; PWR5-NEXT: blr
@@ -11957,11 +11957,11 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) {
; PWR6-LABEL: ugt_1_v2i64:
; PWR6: # %bb.0:
; PWR6-NEXT: addi 5, 3, -1
+; PWR6-NEXT: addi 6, 4, -1
; PWR6-NEXT: and 3, 3, 5
-; PWR6-NEXT: addi 5, 4, -1
+; PWR6-NEXT: and 4, 4, 6
; PWR6-NEXT: subfic 3, 3, 0
; PWR6-NEXT: subfe 3, 3, 3
-; PWR6-NEXT: and 4, 4, 5
; PWR6-NEXT: subfic 4, 4, 0
; PWR6-NEXT: subfe 4, 4, 4
; PWR6-NEXT: blr
@@ -12016,11 +12016,11 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) {
; PWR5-LABEL: ult_2_v2i64:
; PWR5: # %bb.0:
; PWR5-NEXT: addi 5, 3, -1
+; PWR5-NEXT: addi 6, 4, -1
; PWR5-NEXT: and 3, 3, 5
-; PWR5-NEXT: addi 5, 4, -1
+; PWR5-NEXT: and 4, 4, 6
; PWR5-NEXT: addic 3, 3, -1
; PWR5-NEXT: subfe 3, 3, 3
-; PWR5-NEXT: and 4, 4, 5
; PWR5-NEXT: addic 4, 4, -1
; PWR5-NEXT: subfe 4, 4, 4
; PWR5-NEXT: blr
@@ -12028,11 +12028,11 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) {
; PWR6-LABEL: ult_2_v2i64:
; PWR6: # %bb.0:
; PWR6-NEXT: addi 5, 3, -1
+; PWR6-NEXT: addi 6, 4, -1
; PWR6-NEXT: and 3, 3, 5
-; PWR6-NEXT: addi 5, 4, -1
+; PWR6-NEXT: and 4, 4, 6
; PWR6-NEXT: addic 3, 3, -1
; PWR6-NEXT: subfe 3, 3, 3
-; PWR6-NEXT: and 4, 4, 5
; PWR6-NEXT: addic 4, 4, -1
; PWR6-NEXT: subfe 4, 4, 4
; PWR6-NEXT: blr
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
index 95af7861d4798..f9af74d6ec323 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
@@ -357,49 +357,14 @@ define i64 @ctpop_i64(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_two:
; RV32I: # %bb.0:
-; RV32I-NEXT: j .LBB6_2
-; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: sltiu a0, zero, 0
-; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB6_2:
-; RV32I-NEXT: srli a2, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: addi a3, a3, 1365
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a3, a5, a3
-; RV32I-NEXT: lui a5, 61681
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: addi a5, a5, -241
-; RV32I-NEXT: sub a0, a0, a2
-; RV32I-NEXT: sub a1, a1, a3
-; RV32I-NEXT: srli a2, a0, 2
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: and a1, a1, a4
-; RV32I-NEXT: and a2, a2, a4
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: srli a2, a0, 4
-; RV32I-NEXT: srli a3, a1, 4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: slli a2, a0, 16
-; RV32I-NEXT: slli a3, a1, 16
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: add a0, a1, a0
-; RV32I-NEXT: sltiu a0, a0, 2
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: sltiu a4, a2, -1
+; RV32I-NEXT: add a3, a3, a4
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: seqz a0, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_i64_ugt_two:
@@ -422,50 +387,14 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_one:
; RV32I: # %bb.0:
-; RV32I-NEXT: j .LBB7_2
-; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: snez a0, zero
-; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB7_2:
-; RV32I-NEXT: srli a2, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: addi a3, a3, 1365
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a3, a5, a3
-; RV32I-NEXT: lui a5, 61681
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: addi a5, a5, -241
-; RV32I-NEXT: sub a0, a0, a2
-; RV32I-NEXT: sub a1, a1, a3
-; RV32I-NEXT: srli a2, a0, 2
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: and a1, a1, a4
-; RV32I-NEXT: and a2, a2, a4
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: srli a2, a0, 4
-; RV32I-NEXT: srli a3, a1, 4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: slli a2, a0, 16
-; RV32I-NEXT: slli a3, a1, 16
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: add a0, a1, a0
-; RV32I-NEXT: sltiu a0, a0, 2
-; RV32I-NEXT: xori a0, a0, 1
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: sltiu a4, a2, -1
+; RV32I-NEXT: add a3, a3, a4
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_i64_ugt_one:
@@ -489,45 +418,18 @@ define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
define i1 @ctpop_i64_eq_one(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_eq_one:
; RV32I: # %bb.0:
-; RV32I-NEXT: srli a2, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: addi a3, a3, 1365
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a3, a5, a3
-; RV32I-NEXT: lui a5, 61681
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: addi a5, a5, -241
-; RV32I-NEXT: sub a0, a0, a2
-; RV32I-NEXT: sub a1, a1, a3
-; RV32I-NEXT: srli a2, a0, 2
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: and a1, a1, a4
-; RV32I-NEXT: and a2, a2, a4
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: srli a2, a0, 4
-; RV32I-NEXT: srli a3, a1, 4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: slli a2, a0, 16
-; RV32I-NEXT: slli a3, a1, 16
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: add a0, a1, a0
-; RV32I-NEXT: xori a0, a0, 1
-; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: sltiu a3, a2, -1
+; RV32I-NEXT: addi a4, a1, -1
+; RV32I-NEXT: add a3, a4, a3
+; RV32I-NEXT: xor a1, a1, a3
+; RV32I-NEXT: beq a1, a3, .LBB8_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sltu a0, a3, a1
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB8_2:
+; RV32I-NEXT: xor a0, a0, a2
+; RV32I-NEXT: sltu a0, a2, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_i64_eq_one:
@@ -546,45 +448,20 @@ define i1 @ctpop_i64_eq_one(i64 %a) nounwind {
define i1 @ctpop_i64_ne_one(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ne_one:
; RV32I: # %bb.0:
-; RV32I-NEXT: srli a2, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: addi a3, a3, 1365
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a3, a5, a3
-; RV32I-NEXT: lui a5, 61681
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: addi a5, a5, -241
-; RV32I-NEXT: sub a0, a0, a2
-; RV32I-NEXT: sub a1, a1, a3
-; RV32I-NEXT: srli a2, a0, 2
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: and a1, a1, a4
-; RV32I-NEXT: and a2, a2, a4
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: srli a2, a0, 4
-; RV32I-NEXT: srli a3, a1, 4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: slli a2, a0, 16
-; RV32I-NEXT: slli a3, a1, 16
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: sltiu a3, a2, -1
+; RV32I-NEXT: addi a4, a1, -1
+; RV32I-NEXT: add a3, a4, a3
+; RV32I-NEXT: xor a1, a1, a3
+; RV32I-NEXT: beq a1, a3, .LBB9_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sltu a0, a3, a1
+; RV32I-NEXT: xori a0, a0, 1
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB9_2:
+; RV32I-NEXT: xor a0, a0, a2
+; RV32I-NEXT: sltu a0, a2, a0
; RV32I-NEXT: xori a0, a0, 1
-; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_i64_ne_one:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
index 9a6c718703a27..8549a7c526e45 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
@@ -701,31 +701,9 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
; RV64I-LABEL: ctpop_i32_ult_two:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -16
-; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: srliw a1, a0, 1
-; RV64I-NEXT: lui a2, 349525
-; RV64I-NEXT: addi a2, a2, 1365
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: lui a2, 209715
-; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 2
-; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: add a0, a1, a0
-; RV64I-NEXT: sraiw a1, a0, 4
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a2, a2, -241
-; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
-; RV64I-NEXT: sltiu a0, a0, 2
-; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: addiw a1, a0, -1
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: seqz a0, a0
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: ctpop_i32_ult_two:
diff --git a/llvm/test/CodeGen/RISCV/pr101786.ll b/llvm/test/CodeGen/RISCV/pr101786.ll
new file mode 100644
index 0000000000000..6d0736edd3e89
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr101786.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=riscv64 -o - %s | FileCheck %s
+
+define i64 @test(i64 %x, ptr %p) {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: bgtz a2, .LBB0_3
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: addi a3, a2, -1
+; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: bnez a2, .LBB0_3
+; CHECK-NEXT: # %bb.2: # %if.else
+; CHECK-NEXT: ld a0, 0(a1)
+; CHECK-NEXT: .LBB0_3: # %if.end
+; CHECK-NEXT: ret
+entry:
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %x)
+ %cmp1 = icmp ugt i64 %ctpop, 1
+ %cmp2 = icmp sgt i64 %x, 0
+ %or = or i1 %cmp2, %cmp1
+ br i1 %or, label %if.end, label %if.else
+
+if.else:
+ %load = load i64, ptr %p, align 8
+ br label %if.end
+
+if.end:
+ %res = phi i64 [0, %entry], [%load, %if.else]
+ ret i64 %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 1b9b1b89aeb7e..98c86da41afa1 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -416,9 +416,9 @@ define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind {
; RV32I-LABEL: ctpop_v2i32_ult_two:
; RV32I: # %bb.0:
; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a0, a0, a2
-; RV32I-NEXT: addi a2, a1, -1
-; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: seqz a0, a0
; RV32I-NEXT: seqz a1, a1
; RV32I-NEXT: ret
@@ -439,9 +439,9 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
; RV32I-LABEL: ctpop_v2i32_ugt_one:
; RV32I: # %bb.0:
; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a0, a0, a2
-; RV32I-NEXT: addi a2, a1, -1
-; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: snez a1, a1
; RV32I-NEXT: ret
@@ -464,11 +464,11 @@ define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind {
; RV32I-LABEL: ctpop_v2i32_eq_one:
; RV32I: # %bb.0:
; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: xor a1, a1, a3
; RV32I-NEXT: xor a0, a0, a2
; RV32I-NEXT: sltu a0, a2, a0
-; RV32I-NEXT: addi a2, a1, -1
-; RV32I-NEXT: xor a1, a1, a2
-; RV32I-NEXT: sltu a1, a2, a1
+; RV32I-NEXT: sltu a1, a3, a1
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_v2i32_eq_one:
@@ -489,11 +489,11 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
; RV32I-LABEL: ctpop_v2i32_ne_one:
; RV32I: # %bb.0:
; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: xor a1, a1, a3
; RV32I-NEXT: xor a0, a0, a2
; RV32I-NEXT: sltu a0, a2, a0
-; RV32I-NEXT: addi a2, a1, -1
-; RV32I-NEXT: xor a1, a1, a2
-; RV32I-NEXT: sltu a1, a2, a1
+; RV32I-NEXT: sltu a1, a3, a1
; RV32I-NEXT: xori a0, a0, 1
; RV32I-NEXT: xori a1, a1, 1
; RV32I-NEXT: ret
@@ -571,12 +571,12 @@ define i64 @ctpop_i64(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_two:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi a2, a0, -1
-; RV32I-NEXT: and a2, a0, a2
-; RV32I-NEXT: seqz a0, a0
-; RV32I-NEXT: sub a0, a1, a0
-; RV32I-NEXT: and a0, a1, a0
-; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: seqz a2, a0
+; RV32I-NEXT: addi a3, a0, -1
+; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: seqz a0, a0
; RV32I-NEXT: ret
;
@@ -595,12 +595,12 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_one:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi a2, a0, -1
-; RV32I-NEXT: and a2, a0, a2
-; RV32I-NEXT: seqz a0, a0
-; RV32I-NEXT: sub a0, a1, a0
-; RV32I-NEXT: and a0, a1, a0
-; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: seqz a2, a0
+; RV32I-NEXT: addi a3, a0, -1
+; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: ret
;
@@ -785,20 +785,20 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind {
; RV32I-LABEL: ctpop_v2i64_ult_two:
; RV32I: # %bb.0:
; RV32I-NEXT: lw a1, 0(a0)
-; RV32I-NEXT: lw a2, 4(a0)
-; RV32I-NEXT: lw a3, 8(a0)
+; RV32I-NEXT: lw a2, 8(a0)
+; RV32I-NEXT: lw a3, 4(a0)
; RV32I-NEXT: lw a0, 12(a0)
-; RV32I-NEXT: addi a4, a1, -1
-; RV32I-NEXT: and a4, a1, a4
-; RV32I-NEXT: seqz a1, a1
-; RV32I-NEXT: sub a1, a2, a1
-; RV32I-NEXT: and a1, a2, a1
-; RV32I-NEXT: addi a2, a3, -1
-; RV32I-NEXT: and a2, a3, a2
-; RV32I-NEXT: seqz a3, a3
-; RV32I-NEXT: sub a3, a0, a3
-; RV32I-NEXT: and a0, a0, a3
-; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: seqz a4, a1
+; RV32I-NEXT: seqz a5, a2
+; RV32I-NEXT: addi a6, a1, -1
+; RV32I-NEXT: addi a7, a2, -1
+; RV32I-NEXT: sub a4, a3, a4
+; RV32I-NEXT: sub a5, a0, a5
+; RV32I-NEXT: and a2, a2, a7
+; RV32I-NEXT: and a1, a1, a6
+; RV32I-NEXT: and a0, a0, a5
+; RV32I-NEXT: and a3, a3, a4
+; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: or a2, a2, a0
; RV32I-NEXT: seqz a0, a1
; RV32I-NEXT: seqz a1, a2
@@ -828,20 +828,20 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
; RV3...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/102731
More information about the llvm-commits
mailing list