[llvm] [CodeGenPrepare] Unfold slow ctpop when used in power-of-two test (PR #102731)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 19 22:20:18 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-llvm-globalisel
Author: Sergei Barannikov (s-barannikov)
<details>
<summary>Changes</summary>
The DAG combiner already performs this transformation, but in some cases it does
not get the chance, because either CodeGenPrepare or SelectionDAGBuilder moves
the icmp into a different basic block.
https://alive2.llvm.org/ce/z/ARzh99
Fixes #94829
---
Patch is 80.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102731.diff
15 Files Affected:
- (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+59)
- (modified) llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll (+8-8)
- (modified) llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll (+41-164)
- (modified) llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll (+3-25)
- (added) llvm/test/CodeGen/RISCV/pr101786.ll (+32)
- (modified) llvm/test/CodeGen/RISCV/rv32zbb.ll (+48-48)
- (modified) llvm/test/CodeGen/RISCV/rv64zbb.ll (+24-24)
- (modified) llvm/test/CodeGen/X86/ispow2.ll (+22-22)
- (added) llvm/test/CodeGen/X86/pr94829.ll (+27)
- (modified) llvm/test/CodeGen/X86/vector-popcnt-128.ll (+16-18)
- (modified) llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll (+68-68)
- (modified) llvm/test/CodeGen/X86/vector-popcnt-256.ll (+143-145)
- (modified) llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll (+40-40)
- (modified) llvm/test/CodeGen/X86/vector-popcnt-512.ll (+56-56)
- (added) llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll (+99)
``````````diff
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 12a668507fe65..1c32f22234614 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -474,6 +474,7 @@ class CodeGenPrepare {
bool optimizeURem(Instruction *Rem);
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ bool unfoldPow2Test(CmpInst *Cmp);
void verifyBFIUpdates(Function &F);
bool _run(Function &F);
};
@@ -1762,6 +1763,61 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
return true;
}
+// Decanonicalizes icmp+ctpop power-of-two test if ctpop is slow.
+bool CodeGenPrepare::unfoldPow2Test(CmpInst *Cmp) {
+ CmpPredicate Pred;
+ Value *X;
+ const APInt *C;
+
+ // (icmp (ctpop x), c)
+ if (!match(Cmp, m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(X)),
+ m_APIntAllowPoison(C))))
+ return false;
+
+ // This transformation increases the number of instructions, don't do it if
+ // ctpop is fast.
+ Type *OpTy = X->getType();
+ if (TLI->isCtpopFast(TLI->getValueType(*DL, OpTy)))
+ return false;
+
+ // ctpop(x) u< 2 -> (x & (x - 1)) == 0
+ // ctpop(x) u> 1 -> (x & (x - 1)) != 0
+ // Also handles ctpop(x) == 1 and ctpop(x) != 1 if ctpop(x) is known non-zero.
+ if ((Pred == CmpInst::ICMP_ULT && *C == 2) ||
+ (Pred == CmpInst::ICMP_UGT && *C == 1) ||
+ (ICmpInst::isEquality(Pred) && *C == 1 &&
+ isKnownNonZero(Cmp->getOperand(0), *DL))) {
+ IRBuilder<> Builder(Cmp);
+ Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(OpTy));
+ Value *And = Builder.CreateAnd(X, Sub);
+ CmpInst::Predicate NewPred =
+ (Pred == CmpInst::ICMP_ULT || Pred == CmpInst::ICMP_EQ)
+ ? CmpInst::ICMP_EQ
+ : CmpInst::ICMP_NE;
+ Value *NewCmp =
+ Builder.CreateICmp(NewPred, And, ConstantInt::getNullValue(OpTy));
+ Cmp->replaceAllUsesWith(NewCmp);
+ RecursivelyDeleteTriviallyDeadInstructions(Cmp);
+ return true;
+ }
+
+ // (ctpop x) == 1 -> (x ^ (x - 1)) u> (x - 1)
+ // (ctpop x) != 1 -> (x ^ (x - 1)) u<= (x - 1)
+ if (ICmpInst::isEquality(Pred) && *C == 1) {
+ IRBuilder<> Builder(Cmp);
+ Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(OpTy));
+ Value *Xor = Builder.CreateXor(X, Sub);
+ CmpInst::Predicate NewPred =
+ Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT : CmpInst::ICMP_ULE;
+ Value *NewCmp = Builder.CreateICmp(NewPred, Xor, Sub);
+ Cmp->replaceAllUsesWith(NewCmp);
+ RecursivelyDeleteTriviallyDeadInstructions(Cmp);
+ return true;
+ }
+
+ return false;
+}
+
/// Sink the given CmpInst into user blocks to reduce the number of virtual
/// registers that must be created and coalesced. This is a clear win except on
/// targets with multiple condition code registers (PowerPC), where it might
@@ -2183,6 +2239,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (combineToUSubWithOverflow(Cmp, ModifiedDT))
return true;
+ if (unfoldPow2Test(Cmp))
+ return true;
+
if (foldICmpWithDominatingICmp(Cmp, *TLI))
return true;
diff --git a/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll
index ff7f1fc902981..04351346745b3 100644
--- a/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll
@@ -11945,11 +11945,11 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) {
; PWR5-LABEL: ugt_1_v2i64:
; PWR5: # %bb.0:
; PWR5-NEXT: addi 5, 3, -1
+; PWR5-NEXT: addi 6, 4, -1
; PWR5-NEXT: and 3, 3, 5
-; PWR5-NEXT: addi 5, 4, -1
+; PWR5-NEXT: and 4, 4, 6
; PWR5-NEXT: subfic 3, 3, 0
; PWR5-NEXT: subfe 3, 3, 3
-; PWR5-NEXT: and 4, 4, 5
; PWR5-NEXT: subfic 4, 4, 0
; PWR5-NEXT: subfe 4, 4, 4
; PWR5-NEXT: blr
@@ -11957,11 +11957,11 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) {
; PWR6-LABEL: ugt_1_v2i64:
; PWR6: # %bb.0:
; PWR6-NEXT: addi 5, 3, -1
+; PWR6-NEXT: addi 6, 4, -1
; PWR6-NEXT: and 3, 3, 5
-; PWR6-NEXT: addi 5, 4, -1
+; PWR6-NEXT: and 4, 4, 6
; PWR6-NEXT: subfic 3, 3, 0
; PWR6-NEXT: subfe 3, 3, 3
-; PWR6-NEXT: and 4, 4, 5
; PWR6-NEXT: subfic 4, 4, 0
; PWR6-NEXT: subfe 4, 4, 4
; PWR6-NEXT: blr
@@ -12016,11 +12016,11 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) {
; PWR5-LABEL: ult_2_v2i64:
; PWR5: # %bb.0:
; PWR5-NEXT: addi 5, 3, -1
+; PWR5-NEXT: addi 6, 4, -1
; PWR5-NEXT: and 3, 3, 5
-; PWR5-NEXT: addi 5, 4, -1
+; PWR5-NEXT: and 4, 4, 6
; PWR5-NEXT: addic 3, 3, -1
; PWR5-NEXT: subfe 3, 3, 3
-; PWR5-NEXT: and 4, 4, 5
; PWR5-NEXT: addic 4, 4, -1
; PWR5-NEXT: subfe 4, 4, 4
; PWR5-NEXT: blr
@@ -12028,11 +12028,11 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) {
; PWR6-LABEL: ult_2_v2i64:
; PWR6: # %bb.0:
; PWR6-NEXT: addi 5, 3, -1
+; PWR6-NEXT: addi 6, 4, -1
; PWR6-NEXT: and 3, 3, 5
-; PWR6-NEXT: addi 5, 4, -1
+; PWR6-NEXT: and 4, 4, 6
; PWR6-NEXT: addic 3, 3, -1
; PWR6-NEXT: subfe 3, 3, 3
-; PWR6-NEXT: and 4, 4, 5
; PWR6-NEXT: addic 4, 4, -1
; PWR6-NEXT: subfe 4, 4, 4
; PWR6-NEXT: blr
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
index 95af7861d4798..f9af74d6ec323 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
@@ -357,49 +357,14 @@ define i64 @ctpop_i64(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_two:
; RV32I: # %bb.0:
-; RV32I-NEXT: j .LBB6_2
-; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: sltiu a0, zero, 0
-; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB6_2:
-; RV32I-NEXT: srli a2, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: addi a3, a3, 1365
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a3, a5, a3
-; RV32I-NEXT: lui a5, 61681
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: addi a5, a5, -241
-; RV32I-NEXT: sub a0, a0, a2
-; RV32I-NEXT: sub a1, a1, a3
-; RV32I-NEXT: srli a2, a0, 2
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: and a1, a1, a4
-; RV32I-NEXT: and a2, a2, a4
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: srli a2, a0, 4
-; RV32I-NEXT: srli a3, a1, 4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: slli a2, a0, 16
-; RV32I-NEXT: slli a3, a1, 16
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: add a0, a1, a0
-; RV32I-NEXT: sltiu a0, a0, 2
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: sltiu a4, a2, -1
+; RV32I-NEXT: add a3, a3, a4
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: seqz a0, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_i64_ugt_two:
@@ -422,50 +387,14 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_one:
; RV32I: # %bb.0:
-; RV32I-NEXT: j .LBB7_2
-; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: snez a0, zero
-; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB7_2:
-; RV32I-NEXT: srli a2, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: addi a3, a3, 1365
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a3, a5, a3
-; RV32I-NEXT: lui a5, 61681
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: addi a5, a5, -241
-; RV32I-NEXT: sub a0, a0, a2
-; RV32I-NEXT: sub a1, a1, a3
-; RV32I-NEXT: srli a2, a0, 2
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: and a1, a1, a4
-; RV32I-NEXT: and a2, a2, a4
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: srli a2, a0, 4
-; RV32I-NEXT: srli a3, a1, 4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: slli a2, a0, 16
-; RV32I-NEXT: slli a3, a1, 16
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: add a0, a1, a0
-; RV32I-NEXT: sltiu a0, a0, 2
-; RV32I-NEXT: xori a0, a0, 1
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: sltiu a4, a2, -1
+; RV32I-NEXT: add a3, a3, a4
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_i64_ugt_one:
@@ -489,45 +418,18 @@ define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
define i1 @ctpop_i64_eq_one(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_eq_one:
; RV32I: # %bb.0:
-; RV32I-NEXT: srli a2, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: addi a3, a3, 1365
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a3, a5, a3
-; RV32I-NEXT: lui a5, 61681
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: addi a5, a5, -241
-; RV32I-NEXT: sub a0, a0, a2
-; RV32I-NEXT: sub a1, a1, a3
-; RV32I-NEXT: srli a2, a0, 2
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: and a1, a1, a4
-; RV32I-NEXT: and a2, a2, a4
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: srli a2, a0, 4
-; RV32I-NEXT: srli a3, a1, 4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: slli a2, a0, 16
-; RV32I-NEXT: slli a3, a1, 16
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: add a0, a1, a0
-; RV32I-NEXT: xori a0, a0, 1
-; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: sltiu a3, a2, -1
+; RV32I-NEXT: addi a4, a1, -1
+; RV32I-NEXT: add a3, a4, a3
+; RV32I-NEXT: xor a1, a1, a3
+; RV32I-NEXT: beq a1, a3, .LBB8_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sltu a0, a3, a1
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB8_2:
+; RV32I-NEXT: xor a0, a0, a2
+; RV32I-NEXT: sltu a0, a2, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_i64_eq_one:
@@ -546,45 +448,20 @@ define i1 @ctpop_i64_eq_one(i64 %a) nounwind {
define i1 @ctpop_i64_ne_one(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ne_one:
; RV32I: # %bb.0:
-; RV32I-NEXT: srli a2, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: addi a3, a3, 1365
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a3, a5, a3
-; RV32I-NEXT: lui a5, 61681
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: addi a5, a5, -241
-; RV32I-NEXT: sub a0, a0, a2
-; RV32I-NEXT: sub a1, a1, a3
-; RV32I-NEXT: srli a2, a0, 2
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: and a1, a1, a4
-; RV32I-NEXT: and a2, a2, a4
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: srli a2, a0, 4
-; RV32I-NEXT: srli a3, a1, 4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: slli a2, a0, 16
-; RV32I-NEXT: slli a3, a1, 16
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: sltiu a3, a2, -1
+; RV32I-NEXT: addi a4, a1, -1
+; RV32I-NEXT: add a3, a4, a3
+; RV32I-NEXT: xor a1, a1, a3
+; RV32I-NEXT: beq a1, a3, .LBB9_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sltu a0, a3, a1
+; RV32I-NEXT: xori a0, a0, 1
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB9_2:
+; RV32I-NEXT: xor a0, a0, a2
+; RV32I-NEXT: sltu a0, a2, a0
; RV32I-NEXT: xori a0, a0, 1
-; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_i64_ne_one:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
index 9a6c718703a27..8549a7c526e45 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
@@ -701,31 +701,9 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
; RV64I-LABEL: ctpop_i32_ult_two:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -16
-; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: srliw a1, a0, 1
-; RV64I-NEXT: lui a2, 349525
-; RV64I-NEXT: addi a2, a2, 1365
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: lui a2, 209715
-; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 2
-; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: add a0, a1, a0
-; RV64I-NEXT: sraiw a1, a0, 4
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a2, a2, -241
-; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
-; RV64I-NEXT: sltiu a0, a0, 2
-; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: addiw a1, a0, -1
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: seqz a0, a0
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: ctpop_i32_ult_two:
diff --git a/llvm/test/CodeGen/RISCV/pr101786.ll b/llvm/test/CodeGen/RISCV/pr101786.ll
new file mode 100644
index 0000000000000..6d0736edd3e89
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr101786.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=riscv64 -o - %s | FileCheck %s
+
+define i64 @test(i64 %x, ptr %p) {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: bgtz a2, .LBB0_3
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: addi a3, a2, -1
+; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: bnez a2, .LBB0_3
+; CHECK-NEXT: # %bb.2: # %if.else
+; CHECK-NEXT: ld a0, 0(a1)
+; CHECK-NEXT: .LBB0_3: # %if.end
+; CHECK-NEXT: ret
+entry:
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %x)
+ %cmp1 = icmp ugt i64 %ctpop, 1
+ %cmp2 = icmp sgt i64 %x, 0
+ %or = or i1 %cmp2, %cmp1
+ br i1 %or, label %if.end, label %if.else
+
+if.else:
+ %load = load i64, ptr %p, align 8
+ br label %if.end
+
+if.end:
+ %res = phi i64 [0, %entry], [%load, %if.else]
+ ret i64 %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 1b9b1b89aeb7e..98c86da41afa1 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -416,9 +416,9 @@ define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind {
; RV32I-LABEL: ctpop_v2i32_ult_two:
; RV32I: # %bb.0:
; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a0, a0, a2
-; RV32I-NEXT: addi a2, a1, -1
-; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: seqz a0, a0
; RV32I-NEXT: seqz a1, a1
; RV32I-NEXT: ret
@@ -439,9 +439,9 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
; RV32I-LABEL: ctpop_v2i32_ugt_one:
; RV32I: # %bb.0:
; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a0, a0, a2
-; RV32I-NEXT: addi a2, a1, -1
-; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: snez a1, a1
; RV32I-NEXT: ret
@@ -464,11 +464,11 @@ define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind {
; RV32I-LABEL: ctpop_v2i32_eq_one:
; RV32I: # %bb.0:
; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: xor a1, a1, a3
; RV32I-NEXT: xor a0, a0, a2
; RV32I-NEXT: sltu a0, a2, a0
-; RV32I-NEXT: addi a2, a1, -1
-; RV32I-NEXT: xor a1, a1, a2
-; RV32I-NEXT: sltu a1, a2, a1
+; RV32I-NEXT: sltu a1, a3, a1
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_v2i32_eq_one:
@@ -489,11 +489,11 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
; RV32I-LABEL: ctpop_v2i32_ne_one:
; RV32I: # %bb.0:
; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: xor a1, a1, a3
; RV32I-NEXT: xor a0, a0, a2
; RV32I-NEXT: sltu a0, a2, a0
-; RV32I-NEXT: addi a2, a1, -1
-; RV32I-NEXT: xor a1, a1, a2
-; RV32I-NEXT: sltu a1, a2, a1
+; RV32I-NEXT: sltu a1, a3, a1
; RV32I-NEXT: xori a0, a0, 1
; RV32I-NEXT: xori a1, a1, 1
; RV32I-NEXT: ret
@@ -571,12 +571,12 @@ define i64 @ctpop_i64(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_two:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi a2, a0, -1
-; RV32I-NEXT: and a2, a0, a2
-; RV32I-NEXT: seqz a0, a0
-; RV32I-NEXT: sub a0, a1, a0
-; RV32I-NEXT: and a0, a1, a0
-; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: seqz a2, a0
+; RV32I-NEXT: addi a3, a0, -1
+; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: seqz a0, a0
; RV32I-NEXT: ret
;
@@ -595,12 +595,12 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_one:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi a2, a0, -1
-; RV32I-NEXT: and a2, a0, a2
-; RV32I-NEXT: seqz a0, a0
-; RV32I-NEXT: sub a0, a1, a0
-; RV32I-NEXT: and a0, a1, a0
-; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: seqz a2, a0
+; RV32I-NEXT: addi a3, a0, -1
+; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: ret
;
@@ -785,20 +785,20 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind {
; RV32I-LABEL: ctpop_v2i64_ult_two:
; RV32I: # %bb.0:
; RV32I-NEXT: lw a1, 0(a0)
-; RV32I-NEXT: lw a2, 4(a0)
-; RV32I-NEXT: lw a3, 8(a0)
+; RV32I-NEXT: lw a2, 8(a0)
+; RV32I-NEXT: lw a3, 4(a0)
; RV32I-NEXT: lw a0, 12(a0)
-; RV32I-NEXT: addi a4, a1, -1
-; RV32I-NEXT: and a4, a1, a4
-; RV32I-NEXT: seqz a1, a1
-; RV32I-NEXT: sub a1, a2, a1
-; RV32I-NEXT: and a1, a2, a1
-; RV32I-NEXT: addi a2, a3, -1
-; RV32I-NEXT: and a2, a3, a2
-; RV32I-NEXT: seqz a3, a3
-; RV32I-NEXT: sub a3, a0, a3
-; RV32I-NEXT: and a0, a0, a3
-; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: seqz a4, a1
+; RV32I-NEXT: seqz a5, a2
+; RV32I-NEXT: addi a6, a1, -1
+; RV32I-NEXT: addi a7, a2, -1
+; RV32I-NEXT: sub a4, a3, a4
+; RV32I-NEXT: sub a5, a0, a5
+; RV32I-NEXT: and a2, a2, a7
+; RV32I-NEXT: and a1, a1, a6
+; RV32I-NEXT: and a0, a0, a5
+; RV32I-NEXT: and a3, a3, a4
+; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: or a2, a2, a0
; RV32I-NEXT: seqz a0, a1
; RV32I-NEXT: seqz a1, a2
@@ -828,20 +828,20 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
; RV3...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/102731
More information about the llvm-commits
mailing list