[llvm] e042ff7 - [SDAG][RISCV] Avoid expanding is-power-of-2 pattern on riscv32/64 with zbb
Yingwei Zheng via llvm-commits
llvm-commits@lists.llvm.org
Sat Sep 16 11:56:48 PDT 2023
Author: Yingwei Zheng
Date: 2023-09-17T02:56:09+08:00
New Revision: e042ff7eefff6037ffe8350db7c52080a189cce8
URL: https://github.com/llvm/llvm-project/commit/e042ff7eefff6037ffe8350db7c52080a189cce8
DIFF: https://github.com/llvm/llvm-project/commit/e042ff7eefff6037ffe8350db7c52080a189cce8.diff
LOG: [SDAG][RISCV] Avoid expanding is-power-of-2 pattern on riscv32/64 with zbb
This patch adjusts the legality check for riscv to use `cpop/cpopw` since `isOperationLegal(ISD::CTPOP, MVT::i32)` returns false on rv64gc_zbb.
Clang vs gcc: https://godbolt.org/z/rc3s4hjPh
Reviewed By: craig.topper
Differential Revision: https://reviews.llvm.org/D156390
Added:
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.h
llvm/test/CodeGen/RISCV/rv32zbb.ll
llvm/test/CodeGen/RISCV/rv64zbb.ll
llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index eea3372a0e5e300..bd802bd4b173a0b 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -651,6 +651,11 @@ class TargetLoweringBase {
return false;
}
+ /// Return true if ctpop instruction is fast.
+ virtual bool isCtpopFast(EVT VT) const {
+ return isOperationLegal(ISD::CTPOP, VT);
+ }
+
/// Return the maximum number of "x & (x - 1)" operations that can be done
/// instead of deferring to a custom CTPOP.
virtual unsigned getCustomCtpopCost(EVT VT, ISD::CondCode Cond) const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index bd1940994a87f0f..23c1486f711d727 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4168,8 +4168,8 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT,
// (ctpop x) u< 2 -> (x & x-1) == 0
// (ctpop x) u> 1 -> (x & x-1) != 0
if (Cond == ISD::SETULT || Cond == ISD::SETUGT) {
- // Keep the CTPOP if it is a legal vector op.
- if (CTVT.isVector() && TLI.isOperationLegal(ISD::CTPOP, CTVT))
+ // Keep the CTPOP if it is a cheap vector op.
+ if (CTVT.isVector() && TLI.isCtpopFast(CTVT))
return SDValue();
unsigned CostLimit = TLI.getCustomCtpopCost(CTVT, Cond);
@@ -4194,8 +4194,8 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT,
// (ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0)
// (ctpop x) != 1 --> (x == 0) || ((x & x-1) != 0)
if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && C1 == 1) {
- // Keep the CTPOP if it is legal.
- if (TLI.isOperationLegal(ISD::CTPOP, CTVT))
+ // Keep the CTPOP if it is cheap.
+ if (TLI.isCtpopFast(CTVT))
return SDValue();
SDValue Zero = DAG.getConstant(0, dl, CTVT);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 1febc0216f5850c..5c3ea3660e6e672 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18585,6 +18585,20 @@ bool RISCVTargetLowering::areTwoSDNodeTargetMMOFlagsMergeable(
return getTargetMMOFlags(NodeX) == getTargetMMOFlags(NodeY);
}
+bool RISCVTargetLowering::isCtpopFast(EVT VT) const {
+ if (VT.isScalableVector())
+ return isTypeLegal(VT) && Subtarget.hasStdExtZvbb();
+ if (VT.isFixedLengthVector() && Subtarget.hasStdExtZvbb())
+ return true;
+ return Subtarget.hasStdExtZbb() &&
+ (VT == MVT::i32 || VT == MVT::i64 || VT.isFixedLengthVector());
+}
+
+unsigned RISCVTargetLowering::getCustomCtpopCost(EVT VT,
+ ISD::CondCode Cond) const {
+ return isCtpopFast(VT) ? 0 : 1;
+}
+
namespace llvm::RISCVVIntrinsicsTable {
#define GET_RISCVVIntrinsicsTable_IMPL
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index ac5d228f5ebea27..815b9be47f56026 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -607,6 +607,10 @@ class RISCVTargetLowering : public TargetLowering {
}
bool convertSelectOfConstantsToMath(EVT VT) const override { return true; }
+ bool isCtpopFast(EVT VT) const override;
+
+ unsigned getCustomCtpopCost(EVT VT, ISD::CondCode Cond) const override;
+
bool preferZeroCompareBranch() const override { return true; }
bool shouldInsertFencesForAtomic(const Instruction *I) const override {
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 1dee75cce753f5a..92a7ccf5cc7a026 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -309,6 +309,259 @@ define i32 @ctpop_i32(i32 %a) nounwind {
ret i32 %1
}
+define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
+; RV32I-LABEL: ctpop_i32_ult_two:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a1, a0, -1
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_i32_ult_two:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: sltiu a0, a0, 2
+; RV32ZBB-NEXT: ret
+ %1 = call i32 @llvm.ctpop.i32(i32 %a)
+ %2 = icmp ult i32 %1, 2
+ ret i1 %2
+}
+
+define i1 @ctpop_i32_ugt_one(i32 signext %a) nounwind {
+; RV32I-LABEL: ctpop_i32_ugt_one:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a1, a0, -1
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: snez a0, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_i32_ugt_one:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: sltiu a0, a0, 2
+; RV32ZBB-NEXT: xori a0, a0, 1
+; RV32ZBB-NEXT: ret
+ %1 = call i32 @llvm.ctpop.i32(i32 %a)
+ %2 = icmp ugt i32 %1, 1
+ ret i1 %2
+}
+
+define i1 @ctpop_i32_eq_one(i32 signext %a) nounwind {
+; RV32I-LABEL: ctpop_i32_eq_one:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a1, a0, -1
+; RV32I-NEXT: and a1, a0, a1
+; RV32I-NEXT: seqz a1, a1
+; RV32I-NEXT: snez a0, a0
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_i32_eq_one:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: addi a0, a0, -1
+; RV32ZBB-NEXT: seqz a0, a0
+; RV32ZBB-NEXT: ret
+ %1 = call i32 @llvm.ctpop.i32(i32 %a)
+ %2 = icmp eq i32 %1, 1
+ ret i1 %2
+}
+
+define i1 @ctpop_i32_ne_one(i32 signext %a) nounwind {
+; RV32I-LABEL: ctpop_i32_ne_one:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a1, a0, -1
+; RV32I-NEXT: and a1, a0, a1
+; RV32I-NEXT: snez a1, a1
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_i32_ne_one:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: addi a0, a0, -1
+; RV32ZBB-NEXT: snez a0, a0
+; RV32ZBB-NEXT: ret
+ %1 = call i32 @llvm.ctpop.i32(i32 %a)
+ %2 = icmp ne i32 %1, 1
+ ret i1 %2
+}
+
+declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
+
+define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind {
+; RV32I-LABEL: ctpop_v2i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: lui a2, 349525
+; RV32I-NEXT: addi s3, a2, 1365
+; RV32I-NEXT: and a1, a1, s3
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: lui a1, 209715
+; RV32I-NEXT: addi s4, a1, 819
+; RV32I-NEXT: and a1, a0, s4
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: and a0, a0, s4
+; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: lui a1, 61681
+; RV32I-NEXT: addi s5, a1, -241
+; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: lui a1, 4112
+; RV32I-NEXT: addi s1, a1, 257
+; RV32I-NEXT: mv a1, s1
+; RV32I-NEXT: call __mulsi3@plt
+; RV32I-NEXT: srli s2, a0, 24
+; RV32I-NEXT: srli a0, s0, 1
+; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: sub s0, s0, a0
+; RV32I-NEXT: and a0, s0, s4
+; RV32I-NEXT: srli s0, s0, 2
+; RV32I-NEXT: and a1, s0, s4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: mv a1, s1
+; RV32I-NEXT: call __mulsi3@plt
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: mv a0, s2
+; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_v2i32:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: ret
+ %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
+ ret <2 x i32> %1
+}
+
+define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind {
+; RV32I-LABEL: ctpop_v2i32_ult_two:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: addi a2, a1, -1
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: seqz a1, a1
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_v2i32_ult_two:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: sltiu a0, a0, 2
+; RV32ZBB-NEXT: sltiu a1, a1, 2
+; RV32ZBB-NEXT: ret
+ %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
+ %2 = icmp ult <2 x i32> %1, <i32 2, i32 2>
+ ret <2 x i1> %2
+}
+
+define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
+; RV32I-LABEL: ctpop_v2i32_ugt_one:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: snez a0, a0
+; RV32I-NEXT: addi a2, a1, -1
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: snez a1, a1
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_v2i32_ugt_one:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: sltiu a0, a0, 2
+; RV32ZBB-NEXT: xori a0, a0, 1
+; RV32ZBB-NEXT: sltiu a1, a1, 2
+; RV32ZBB-NEXT: xori a1, a1, 1
+; RV32ZBB-NEXT: ret
+ %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
+ %2 = icmp ugt <2 x i32> %1, <i32 1, i32 1>
+ ret <2 x i1> %2
+}
+
+define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind {
+; RV32I-LABEL: ctpop_v2i32_eq_one:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: and a2, a0, a2
+; RV32I-NEXT: seqz a2, a2
+; RV32I-NEXT: snez a0, a0
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: addi a2, a1, -1
+; RV32I-NEXT: and a2, a1, a2
+; RV32I-NEXT: seqz a2, a2
+; RV32I-NEXT: snez a1, a1
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_v2i32_eq_one:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: addi a0, a0, -1
+; RV32ZBB-NEXT: seqz a0, a0
+; RV32ZBB-NEXT: addi a1, a1, -1
+; RV32ZBB-NEXT: seqz a1, a1
+; RV32ZBB-NEXT: ret
+ %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
+ %2 = icmp eq <2 x i32> %1, <i32 1, i32 1>
+ ret <2 x i1> %2
+}
+
+define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
+; RV32I-LABEL: ctpop_v2i32_ne_one:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: and a2, a0, a2
+; RV32I-NEXT: snez a2, a2
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: or a0, a0, a2
+; RV32I-NEXT: addi a2, a1, -1
+; RV32I-NEXT: and a2, a1, a2
+; RV32I-NEXT: snez a2, a2
+; RV32I-NEXT: seqz a1, a1
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_v2i32_ne_one:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: addi a0, a0, -1
+; RV32ZBB-NEXT: snez a0, a0
+; RV32ZBB-NEXT: addi a1, a1, -1
+; RV32ZBB-NEXT: snez a1, a1
+; RV32ZBB-NEXT: ret
+ %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
+ %2 = icmp ne <2 x i32> %1, <i32 1, i32 1>
+ ret <2 x i1> %2
+}
+
declare i64 @llvm.ctpop.i64(i64)
define i64 @ctpop_i64(i64 %a) nounwind {
@@ -380,6 +633,422 @@ define i64 @ctpop_i64(i64 %a) nounwind {
ret i64 %1
}
+define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
+; RV32I-LABEL: ctpop_i64_ugt_two:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: and a2, a0, a2
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: sub a0, a1, a0
+; RV32I-NEXT: and a0, a1, a0
+; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_i64_ugt_two:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: sltiu a0, a0, 2
+; RV32ZBB-NEXT: ret
+ %1 = call i64 @llvm.ctpop.i64(i64 %a)
+ %2 = icmp ult i64 %1, 2
+ ret i1 %2
+}
+
+define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
+; RV32I-LABEL: ctpop_i64_ugt_one:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: and a2, a0, a2
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: sub a0, a1, a0
+; RV32I-NEXT: and a0, a1, a0
+; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: snez a0, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_i64_ugt_one:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: sltiu a0, a0, 2
+; RV32ZBB-NEXT: xori a0, a0, 1
+; RV32ZBB-NEXT: ret
+ %1 = call i64 @llvm.ctpop.i64(i64 %a)
+ %2 = icmp ugt i64 %1, 1
+ ret i1 %2
+}
+
+define i1 @ctpop_i64_eq_one(i64 %a) nounwind {
+; RV32I-LABEL: ctpop_i64_eq_one:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: and a2, a0, a2
+; RV32I-NEXT: seqz a3, a0
+; RV32I-NEXT: sub a3, a1, a3
+; RV32I-NEXT: and a3, a1, a3
+; RV32I-NEXT: or a2, a2, a3
+; RV32I-NEXT: seqz a2, a2
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: snez a0, a0
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_i64_eq_one:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: addi a0, a0, -1
+; RV32ZBB-NEXT: seqz a0, a0
+; RV32ZBB-NEXT: ret
+ %1 = call i64 @llvm.ctpop.i64(i64 %a)
+ %2 = icmp eq i64 %1, 1
+ ret i1 %2
+}
+
+define i1 @ctpop_i64_ne_one(i64 %a) nounwind {
+; RV32I-LABEL: ctpop_i64_ne_one:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: and a2, a0, a2
+; RV32I-NEXT: seqz a3, a0
+; RV32I-NEXT: sub a3, a1, a3
+; RV32I-NEXT: and a3, a1, a3
+; RV32I-NEXT: or a2, a2, a3
+; RV32I-NEXT: snez a2, a2
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: or a0, a0, a2
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_i64_ne_one:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: add a0, a0, a1
+; RV32ZBB-NEXT: addi a0, a0, -1
+; RV32ZBB-NEXT: snez a0, a0
+; RV32ZBB-NEXT: ret
+ %1 = call i64 @llvm.ctpop.i64(i64 %a)
+ %2 = icmp ne i64 %1, 1
+ ret i1 %2
+}
+
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
+
+define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
+; RV32I-LABEL: ctpop_v2i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -48
+; RV32I-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: lw a0, 4(a1)
+; RV32I-NEXT: lw s2, 8(a1)
+; RV32I-NEXT: lw s5, 12(a1)
+; RV32I-NEXT: lw s6, 0(a1)
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: lui a2, 349525
+; RV32I-NEXT: addi s3, a2, 1365
+; RV32I-NEXT: and a1, a1, s3
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: lui a1, 209715
+; RV32I-NEXT: addi s4, a1, 819
+; RV32I-NEXT: and a1, a0, s4
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: and a0, a0, s4
+; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: lui a1, 61681
+; RV32I-NEXT: addi s7, a1, -241
+; RV32I-NEXT: and a0, a0, s7
+; RV32I-NEXT: lui a1, 4112
+; RV32I-NEXT: addi s1, a1, 257
+; RV32I-NEXT: mv a1, s1
+; RV32I-NEXT: call __mulsi3@plt
+; RV32I-NEXT: srli s8, a0, 24
+; RV32I-NEXT: srli a0, s6, 1
+; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: sub a0, s6, a0
+; RV32I-NEXT: and a1, a0, s4
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: and a0, a0, s4
+; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: and a0, a0, s7
+; RV32I-NEXT: mv a1, s1
+; RV32I-NEXT: call __mulsi3@plt
+; RV32I-NEXT: srli a0, a0, 24
+; RV32I-NEXT: add s8, a0, s8
+; RV32I-NEXT: srli a0, s5, 1
+; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: sub a0, s5, a0
+; RV32I-NEXT: and a1, a0, s4
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: and a0, a0, s4
+; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: and a0, a0, s7
+; RV32I-NEXT: mv a1, s1
+; RV32I-NEXT: call __mulsi3@plt
+; RV32I-NEXT: srli s5, a0, 24
+; RV32I-NEXT: srli a0, s2, 1
+; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: sub a0, s2, a0
+; RV32I-NEXT: and a1, a0, s4
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: and a0, a0, s4
+; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: and a0, a0, s7
+; RV32I-NEXT: mv a1, s1
+; RV32I-NEXT: call __mulsi3@plt
+; RV32I-NEXT: srli a0, a0, 24
+; RV32I-NEXT: add a0, a0, s5
+; RV32I-NEXT: sw zero, 12(s0)
+; RV32I-NEXT: sw zero, 4(s0)
+; RV32I-NEXT: sw a0, 8(s0)
+; RV32I-NEXT: sw s8, 0(s0)
+; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 48
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_v2i64:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: lw a2, 4(a1)
+; RV32ZBB-NEXT: lw a3, 0(a1)
+; RV32ZBB-NEXT: lw a4, 8(a1)
+; RV32ZBB-NEXT: lw a1, 12(a1)
+; RV32ZBB-NEXT: cpop a2, a2
+; RV32ZBB-NEXT: cpop a3, a3
+; RV32ZBB-NEXT: add a2, a3, a2
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: cpop a3, a4
+; RV32ZBB-NEXT: add a1, a3, a1
+; RV32ZBB-NEXT: sw zero, 12(a0)
+; RV32ZBB-NEXT: sw zero, 4(a0)
+; RV32ZBB-NEXT: sw a1, 8(a0)
+; RV32ZBB-NEXT: sw a2, 0(a0)
+; RV32ZBB-NEXT: ret
+ %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+ ret <2 x i64> %1
+}
+
+define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind {
+; RV32I-LABEL: ctpop_v2i64_ult_two:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lw a1, 0(a0)
+; RV32I-NEXT: lw a2, 12(a0)
+; RV32I-NEXT: lw a3, 8(a0)
+; RV32I-NEXT: lw a0, 4(a0)
+; RV32I-NEXT: addi a4, a1, -1
+; RV32I-NEXT: and a4, a1, a4
+; RV32I-NEXT: seqz a1, a1
+; RV32I-NEXT: sub a1, a0, a1
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: addi a1, a3, -1
+; RV32I-NEXT: and a1, a3, a1
+; RV32I-NEXT: seqz a3, a3
+; RV32I-NEXT: sub a3, a2, a3
+; RV32I-NEXT: and a2, a2, a3
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: seqz a1, a1
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_v2i64_ult_two:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: lw a1, 12(a0)
+; RV32ZBB-NEXT: lw a2, 8(a0)
+; RV32ZBB-NEXT: lw a3, 0(a0)
+; RV32ZBB-NEXT: lw a0, 4(a0)
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: cpop a2, a2
+; RV32ZBB-NEXT: add a1, a2, a1
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: cpop a2, a3
+; RV32ZBB-NEXT: add a0, a2, a0
+; RV32ZBB-NEXT: sltiu a0, a0, 2
+; RV32ZBB-NEXT: sltiu a1, a1, 2
+; RV32ZBB-NEXT: ret
+ %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+ %2 = icmp ult <2 x i64> %1, <i64 2, i64 2>
+ ret <2 x i1> %2
+}
+
+define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
+; RV32I-LABEL: ctpop_v2i64_ugt_one:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lw a1, 0(a0)
+; RV32I-NEXT: lw a2, 12(a0)
+; RV32I-NEXT: lw a3, 8(a0)
+; RV32I-NEXT: lw a0, 4(a0)
+; RV32I-NEXT: addi a4, a1, -1
+; RV32I-NEXT: and a4, a1, a4
+; RV32I-NEXT: seqz a1, a1
+; RV32I-NEXT: sub a1, a0, a1
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: snez a0, a0
+; RV32I-NEXT: addi a1, a3, -1
+; RV32I-NEXT: and a1, a3, a1
+; RV32I-NEXT: seqz a3, a3
+; RV32I-NEXT: sub a3, a2, a3
+; RV32I-NEXT: and a2, a2, a3
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: snez a1, a1
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_v2i64_ugt_one:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: lw a1, 12(a0)
+; RV32ZBB-NEXT: lw a2, 8(a0)
+; RV32ZBB-NEXT: lw a3, 0(a0)
+; RV32ZBB-NEXT: lw a0, 4(a0)
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: cpop a2, a2
+; RV32ZBB-NEXT: add a1, a2, a1
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: cpop a2, a3
+; RV32ZBB-NEXT: add a0, a2, a0
+; RV32ZBB-NEXT: sltiu a0, a0, 2
+; RV32ZBB-NEXT: xori a0, a0, 1
+; RV32ZBB-NEXT: sltiu a1, a1, 2
+; RV32ZBB-NEXT: xori a1, a1, 1
+; RV32ZBB-NEXT: ret
+ %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+ %2 = icmp ugt <2 x i64> %1, <i64 1, i64 1>
+ ret <2 x i1> %2
+}
+
+define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
+; RV32I-LABEL: ctpop_v2i64_eq_one:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lw a1, 0(a0)
+; RV32I-NEXT: lw a2, 12(a0)
+; RV32I-NEXT: lw a3, 8(a0)
+; RV32I-NEXT: lw a0, 4(a0)
+; RV32I-NEXT: addi a4, a1, -1
+; RV32I-NEXT: and a4, a1, a4
+; RV32I-NEXT: seqz a5, a1
+; RV32I-NEXT: sub a5, a0, a5
+; RV32I-NEXT: and a5, a0, a5
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: seqz a4, a4
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: snez a0, a0
+; RV32I-NEXT: and a0, a0, a4
+; RV32I-NEXT: addi a1, a3, -1
+; RV32I-NEXT: and a1, a3, a1
+; RV32I-NEXT: seqz a4, a3
+; RV32I-NEXT: sub a4, a2, a4
+; RV32I-NEXT: and a4, a2, a4
+; RV32I-NEXT: or a1, a1, a4
+; RV32I-NEXT: seqz a1, a1
+; RV32I-NEXT: or a2, a3, a2
+; RV32I-NEXT: snez a2, a2
+; RV32I-NEXT: and a1, a2, a1
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_v2i64_eq_one:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: lw a1, 12(a0)
+; RV32ZBB-NEXT: lw a2, 8(a0)
+; RV32ZBB-NEXT: lw a3, 0(a0)
+; RV32ZBB-NEXT: lw a0, 4(a0)
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: cpop a2, a2
+; RV32ZBB-NEXT: add a1, a2, a1
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: cpop a2, a3
+; RV32ZBB-NEXT: add a0, a2, a0
+; RV32ZBB-NEXT: addi a0, a0, -1
+; RV32ZBB-NEXT: seqz a0, a0
+; RV32ZBB-NEXT: addi a1, a1, -1
+; RV32ZBB-NEXT: seqz a1, a1
+; RV32ZBB-NEXT: ret
+ %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+ %2 = icmp eq <2 x i64> %1, <i64 1, i64 1>
+ ret <2 x i1> %2
+}
+
+define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind {
+; RV32I-LABEL: ctpop_v2i64_ne_one:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lw a1, 0(a0)
+; RV32I-NEXT: lw a2, 12(a0)
+; RV32I-NEXT: lw a3, 8(a0)
+; RV32I-NEXT: lw a0, 4(a0)
+; RV32I-NEXT: addi a4, a1, -1
+; RV32I-NEXT: and a4, a1, a4
+; RV32I-NEXT: seqz a5, a1
+; RV32I-NEXT: sub a5, a0, a5
+; RV32I-NEXT: and a5, a0, a5
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: snez a4, a4
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: addi a1, a3, -1
+; RV32I-NEXT: and a1, a3, a1
+; RV32I-NEXT: seqz a4, a3
+; RV32I-NEXT: sub a4, a2, a4
+; RV32I-NEXT: and a4, a2, a4
+; RV32I-NEXT: or a1, a1, a4
+; RV32I-NEXT: snez a1, a1
+; RV32I-NEXT: or a2, a3, a2
+; RV32I-NEXT: seqz a2, a2
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop_v2i64_ne_one:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: lw a1, 12(a0)
+; RV32ZBB-NEXT: lw a2, 8(a0)
+; RV32ZBB-NEXT: lw a3, 0(a0)
+; RV32ZBB-NEXT: lw a0, 4(a0)
+; RV32ZBB-NEXT: cpop a1, a1
+; RV32ZBB-NEXT: cpop a2, a2
+; RV32ZBB-NEXT: add a1, a2, a1
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: cpop a2, a3
+; RV32ZBB-NEXT: add a0, a2, a0
+; RV32ZBB-NEXT: addi a0, a0, -1
+; RV32ZBB-NEXT: snez a0, a0
+; RV32ZBB-NEXT: addi a1, a1, -1
+; RV32ZBB-NEXT: snez a1, a1
+; RV32ZBB-NEXT: ret
+ %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+ %2 = icmp ne <2 x i64> %1, <i64 1, i64 1>
+ ret <2 x i1> %2
+}
+
define i32 @sextb_i32(i32 %a) nounwind {
; RV32I-LABEL: sextb_i32:
; RV32I: # %bb.0:
@@ -451,10 +1120,10 @@ define i64 @sexth_i64(i64 %a) nounwind {
define i32 @min_i32(i32 %a, i32 %b) nounwind {
; RV32I-LABEL: min_i32:
; RV32I: # %bb.0:
-; RV32I-NEXT: blt a0, a1, .LBB10_2
+; RV32I-NEXT: blt a0, a1, .LBB28_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: .LBB10_2:
+; RV32I-NEXT: .LBB28_2:
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: min_i32:
@@ -474,18 +1143,18 @@ define i32 @min_i32(i32 %a, i32 %b) nounwind {
define i64 @min_i64(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: min_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: beq a1, a3, .LBB11_2
+; CHECK-NEXT: beq a1, a3, .LBB29_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: slt a4, a1, a3
-; CHECK-NEXT: beqz a4, .LBB11_3
-; CHECK-NEXT: j .LBB11_4
-; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: beqz a4, .LBB29_3
+; CHECK-NEXT: j .LBB29_4
+; CHECK-NEXT: .LBB29_2:
; CHECK-NEXT: sltu a4, a0, a2
-; CHECK-NEXT: bnez a4, .LBB11_4
-; CHECK-NEXT: .LBB11_3:
+; CHECK-NEXT: bnez a4, .LBB29_4
+; CHECK-NEXT: .LBB29_3:
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a1, a3
-; CHECK-NEXT: .LBB11_4:
+; CHECK-NEXT: .LBB29_4:
; CHECK-NEXT: ret
%cmp = icmp slt i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
@@ -495,10 +1164,10 @@ define i64 @min_i64(i64 %a, i64 %b) nounwind {
define i32 @max_i32(i32 %a, i32 %b) nounwind {
; RV32I-LABEL: max_i32:
; RV32I: # %bb.0:
-; RV32I-NEXT: blt a1, a0, .LBB12_2
+; RV32I-NEXT: blt a1, a0, .LBB30_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: .LBB12_2:
+; RV32I-NEXT: .LBB30_2:
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: max_i32:
@@ -518,18 +1187,18 @@ define i32 @max_i32(i32 %a, i32 %b) nounwind {
define i64 @max_i64(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: max_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: beq a1, a3, .LBB13_2
+; CHECK-NEXT: beq a1, a3, .LBB31_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: slt a4, a3, a1
-; CHECK-NEXT: beqz a4, .LBB13_3
-; CHECK-NEXT: j .LBB13_4
-; CHECK-NEXT: .LBB13_2:
+; CHECK-NEXT: beqz a4, .LBB31_3
+; CHECK-NEXT: j .LBB31_4
+; CHECK-NEXT: .LBB31_2:
; CHECK-NEXT: sltu a4, a2, a0
-; CHECK-NEXT: bnez a4, .LBB13_4
-; CHECK-NEXT: .LBB13_3:
+; CHECK-NEXT: bnez a4, .LBB31_4
+; CHECK-NEXT: .LBB31_3:
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a1, a3
-; CHECK-NEXT: .LBB13_4:
+; CHECK-NEXT: .LBB31_4:
; CHECK-NEXT: ret
%cmp = icmp sgt i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
@@ -539,10 +1208,10 @@ define i64 @max_i64(i64 %a, i64 %b) nounwind {
define i32 @minu_i32(i32 %a, i32 %b) nounwind {
; RV32I-LABEL: minu_i32:
; RV32I: # %bb.0:
-; RV32I-NEXT: bltu a0, a1, .LBB14_2
+; RV32I-NEXT: bltu a0, a1, .LBB32_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: .LBB14_2:
+; RV32I-NEXT: .LBB32_2:
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: minu_i32:
@@ -562,18 +1231,18 @@ define i32 @minu_i32(i32 %a, i32 %b) nounwind {
define i64 @minu_i64(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: minu_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: beq a1, a3, .LBB15_2
+; CHECK-NEXT: beq a1, a3, .LBB33_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: sltu a4, a1, a3
-; CHECK-NEXT: beqz a4, .LBB15_3
-; CHECK-NEXT: j .LBB15_4
-; CHECK-NEXT: .LBB15_2:
+; CHECK-NEXT: beqz a4, .LBB33_3
+; CHECK-NEXT: j .LBB33_4
+; CHECK-NEXT: .LBB33_2:
; CHECK-NEXT: sltu a4, a0, a2
-; CHECK-NEXT: bnez a4, .LBB15_4
-; CHECK-NEXT: .LBB15_3:
+; CHECK-NEXT: bnez a4, .LBB33_4
+; CHECK-NEXT: .LBB33_3:
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a1, a3
-; CHECK-NEXT: .LBB15_4:
+; CHECK-NEXT: .LBB33_4:
; CHECK-NEXT: ret
%cmp = icmp ult i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
@@ -583,10 +1252,10 @@ define i64 @minu_i64(i64 %a, i64 %b) nounwind {
define i32 @maxu_i32(i32 %a, i32 %b) nounwind {
; RV32I-LABEL: maxu_i32:
; RV32I: # %bb.0:
-; RV32I-NEXT: bltu a1, a0, .LBB16_2
+; RV32I-NEXT: bltu a1, a0, .LBB34_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: .LBB16_2:
+; RV32I-NEXT: .LBB34_2:
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: maxu_i32:
@@ -606,18 +1275,18 @@ define i32 @maxu_i32(i32 %a, i32 %b) nounwind {
define i64 @maxu_i64(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: maxu_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: beq a1, a3, .LBB17_2
+; CHECK-NEXT: beq a1, a3, .LBB35_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: sltu a4, a3, a1
-; CHECK-NEXT: beqz a4, .LBB17_3
-; CHECK-NEXT: j .LBB17_4
-; CHECK-NEXT: .LBB17_2:
+; CHECK-NEXT: beqz a4, .LBB35_3
+; CHECK-NEXT: j .LBB35_4
+; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: sltu a4, a2, a0
-; CHECK-NEXT: bnez a4, .LBB17_4
-; CHECK-NEXT: .LBB17_3:
+; CHECK-NEXT: bnez a4, .LBB35_4
+; CHECK-NEXT: .LBB35_3:
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a1, a3
-; CHECK-NEXT: .LBB17_4:
+; CHECK-NEXT: .LBB35_4:
; CHECK-NEXT: ret
%cmp = icmp ugt i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
@@ -648,13 +1317,13 @@ declare i64 @llvm.abs.i64(i64, i1 immarg)
define i64 @abs_i64(i64 %x) {
; CHECK-LABEL: abs_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: bgez a1, .LBB19_2
+; CHECK-NEXT: bgez a1, .LBB37_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: snez a2, a0
; CHECK-NEXT: neg a0, a0
; CHECK-NEXT: neg a1, a1
; CHECK-NEXT: sub a1, a1, a2
-; CHECK-NEXT: .LBB19_2:
+; CHECK-NEXT: .LBB37_2:
; CHECK-NEXT: ret
%abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true)
ret i64 %abs
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index e0a1dbd0af302ff..18f1574b5352679 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -578,6 +578,85 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
ret i32 %1
}
+define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
+; RV64I-LABEL: ctpop_i32_ult_two:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addiw a1, a0, -1
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_i32_ult_two:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpopw a0, a0
+; RV64ZBB-NEXT: sltiu a0, a0, 2
+; RV64ZBB-NEXT: ret
+ %1 = call i32 @llvm.ctpop.i32(i32 %a)
+ %2 = icmp ult i32 %1, 2
+ ret i1 %2
+}
+
+define i1 @ctpop_i32_ugt_one(i32 signext %a) nounwind {
+; RV64I-LABEL: ctpop_i32_ugt_one:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addiw a1, a0, -1
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: snez a0, a0
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_i32_ugt_one:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpopw a0, a0
+; RV64ZBB-NEXT: sltiu a0, a0, 2
+; RV64ZBB-NEXT: xori a0, a0, 1
+; RV64ZBB-NEXT: ret
+ %1 = call i32 @llvm.ctpop.i32(i32 %a)
+ %2 = icmp ugt i32 %1, 1
+ ret i1 %2
+}
+
+define i1 @ctpop_i32_eq_one(i32 signext %a) nounwind {
+; RV64I-LABEL: ctpop_i32_eq_one:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addiw a1, a0, -1
+; RV64I-NEXT: and a1, a0, a1
+; RV64I-NEXT: seqz a1, a1
+; RV64I-NEXT: snez a0, a0
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_i32_eq_one:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpopw a0, a0
+; RV64ZBB-NEXT: addi a0, a0, -1
+; RV64ZBB-NEXT: seqz a0, a0
+; RV64ZBB-NEXT: ret
+ %1 = call i32 @llvm.ctpop.i32(i32 %a)
+ %2 = icmp eq i32 %1, 1
+ ret i1 %2
+}
+
+define i1 @ctpop_i32_ne_one(i32 signext %a) nounwind {
+; RV64I-LABEL: ctpop_i32_ne_one:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addiw a1, a0, -1
+; RV64I-NEXT: and a1, a0, a1
+; RV64I-NEXT: snez a1, a1
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_i32_ne_one:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpopw a0, a0
+; RV64ZBB-NEXT: addi a0, a0, -1
+; RV64ZBB-NEXT: snez a0, a0
+; RV64ZBB-NEXT: ret
+ %1 = call i32 @llvm.ctpop.i32(i32 %a)
+ %2 = icmp ne i32 %1, 1
+ ret i1 %2
+}
+
define signext i32 @ctpop_i32_load(ptr %p) nounwind {
; RV64I-LABEL: ctpop_i32_load:
; RV64I: # %bb.0:
@@ -618,6 +697,192 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
ret i32 %1
}
+declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
+
+define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind {
+; RV64I-LABEL: ctpop_v2i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: mv s0, a1
+; RV64I-NEXT: srli a1, a0, 1
+; RV64I-NEXT: lui a2, 349525
+; RV64I-NEXT: addiw s3, a2, 1365
+; RV64I-NEXT: and a1, a1, s3
+; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: lui a1, 209715
+; RV64I-NEXT: addiw s4, a1, 819
+; RV64I-NEXT: and a1, a0, s4
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, s4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw s5, a1, -241
+; RV64I-NEXT: and a0, a0, s5
+; RV64I-NEXT: lui a1, 4112
+; RV64I-NEXT: addiw s1, a1, 257
+; RV64I-NEXT: mv a1, s1
+; RV64I-NEXT: call __muldi3 at plt
+; RV64I-NEXT: srliw s2, a0, 24
+; RV64I-NEXT: srli a0, s0, 1
+; RV64I-NEXT: and a0, a0, s3
+; RV64I-NEXT: sub s0, s0, a0
+; RV64I-NEXT: and a0, s0, s4
+; RV64I-NEXT: srli s0, s0, 2
+; RV64I-NEXT: and a1, s0, s4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: and a0, a0, s5
+; RV64I-NEXT: mv a1, s1
+; RV64I-NEXT: call __muldi3 at plt
+; RV64I-NEXT: srliw a1, a0, 24
+; RV64I-NEXT: mv a0, s2
+; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_v2i32:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpopw a0, a0
+; RV64ZBB-NEXT: cpopw a1, a1
+; RV64ZBB-NEXT: ret
+ %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
+ ret <2 x i32> %1
+}
+
+define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind {
+; RV64I-LABEL: ctpop_v2i32_ult_two:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addiw a2, a0, -1
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: sext.w a0, a0
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: addiw a2, a1, -1
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: sext.w a1, a1
+; RV64I-NEXT: seqz a1, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_v2i32_ult_two:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpopw a1, a1
+; RV64ZBB-NEXT: cpopw a0, a0
+; RV64ZBB-NEXT: sltiu a0, a0, 2
+; RV64ZBB-NEXT: sltiu a1, a1, 2
+; RV64ZBB-NEXT: ret
+ %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
+ %2 = icmp ult <2 x i32> %1, <i32 2, i32 2>
+ ret <2 x i1> %2
+}
+
+define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
+; RV64I-LABEL: ctpop_v2i32_ugt_one:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addiw a2, a0, -1
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: sext.w a0, a0
+; RV64I-NEXT: snez a0, a0
+; RV64I-NEXT: addiw a2, a1, -1
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: sext.w a1, a1
+; RV64I-NEXT: snez a1, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_v2i32_ugt_one:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpopw a1, a1
+; RV64ZBB-NEXT: cpopw a0, a0
+; RV64ZBB-NEXT: sltiu a0, a0, 2
+; RV64ZBB-NEXT: xori a0, a0, 1
+; RV64ZBB-NEXT: sltiu a1, a1, 2
+; RV64ZBB-NEXT: xori a1, a1, 1
+; RV64ZBB-NEXT: ret
+ %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
+ %2 = icmp ugt <2 x i32> %1, <i32 1, i32 1>
+ ret <2 x i1> %2
+}
+
+define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind {
+; RV64I-LABEL: ctpop_v2i32_eq_one:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sext.w a2, a1
+; RV64I-NEXT: sext.w a3, a0
+; RV64I-NEXT: addiw a4, a0, -1
+; RV64I-NEXT: and a0, a0, a4
+; RV64I-NEXT: sext.w a0, a0
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: snez a3, a3
+; RV64I-NEXT: and a0, a3, a0
+; RV64I-NEXT: addiw a3, a1, -1
+; RV64I-NEXT: and a1, a1, a3
+; RV64I-NEXT: sext.w a1, a1
+; RV64I-NEXT: seqz a1, a1
+; RV64I-NEXT: snez a2, a2
+; RV64I-NEXT: and a1, a2, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_v2i32_eq_one:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpopw a1, a1
+; RV64ZBB-NEXT: cpopw a0, a0
+; RV64ZBB-NEXT: addi a0, a0, -1
+; RV64ZBB-NEXT: seqz a0, a0
+; RV64ZBB-NEXT: addi a1, a1, -1
+; RV64ZBB-NEXT: seqz a1, a1
+; RV64ZBB-NEXT: ret
+ %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
+ %2 = icmp eq <2 x i32> %1, <i32 1, i32 1>
+ ret <2 x i1> %2
+}
+
+define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
+; RV64I-LABEL: ctpop_v2i32_ne_one:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sext.w a2, a1
+; RV64I-NEXT: sext.w a3, a0
+; RV64I-NEXT: addiw a4, a0, -1
+; RV64I-NEXT: and a0, a0, a4
+; RV64I-NEXT: sext.w a0, a0
+; RV64I-NEXT: snez a0, a0
+; RV64I-NEXT: seqz a3, a3
+; RV64I-NEXT: or a0, a3, a0
+; RV64I-NEXT: addiw a3, a1, -1
+; RV64I-NEXT: and a1, a1, a3
+; RV64I-NEXT: sext.w a1, a1
+; RV64I-NEXT: snez a1, a1
+; RV64I-NEXT: seqz a2, a2
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_v2i32_ne_one:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpopw a1, a1
+; RV64ZBB-NEXT: cpopw a0, a0
+; RV64ZBB-NEXT: addi a0, a0, -1
+; RV64ZBB-NEXT: snez a0, a0
+; RV64ZBB-NEXT: addi a1, a1, -1
+; RV64ZBB-NEXT: snez a1, a1
+; RV64ZBB-NEXT: ret
+ %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
+ %2 = icmp ne <2 x i32> %1, <i32 1, i32 1>
+ ret <2 x i1> %2
+}
+
declare i64 @llvm.ctpop.i64(i64)
define i64 @ctpop_i64(i64 %a) nounwind {
@@ -665,6 +930,267 @@ define i64 @ctpop_i64(i64 %a) nounwind {
ret i64 %1
}
+define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
+; RV64I-LABEL: ctpop_i64_ugt_two:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a1, a0, -1
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_i64_ugt_two:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpop a0, a0
+; RV64ZBB-NEXT: sltiu a0, a0, 2
+; RV64ZBB-NEXT: ret
+ %1 = call i64 @llvm.ctpop.i64(i64 %a)
+ %2 = icmp ult i64 %1, 2
+ ret i1 %2
+}
+
+define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
+; RV64I-LABEL: ctpop_i64_ugt_one:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a1, a0, -1
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: snez a0, a0
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_i64_ugt_one:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpop a0, a0
+; RV64ZBB-NEXT: sltiu a0, a0, 2
+; RV64ZBB-NEXT: xori a0, a0, 1
+; RV64ZBB-NEXT: ret
+ %1 = call i64 @llvm.ctpop.i64(i64 %a)
+ %2 = icmp ugt i64 %1, 1
+ ret i1 %2
+}
+
+define i1 @ctpop_i64_eq_one(i64 %a) nounwind {
+; RV64I-LABEL: ctpop_i64_eq_one:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a1, a0, -1
+; RV64I-NEXT: and a1, a0, a1
+; RV64I-NEXT: seqz a1, a1
+; RV64I-NEXT: snez a0, a0
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_i64_eq_one:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpop a0, a0
+; RV64ZBB-NEXT: addi a0, a0, -1
+; RV64ZBB-NEXT: seqz a0, a0
+; RV64ZBB-NEXT: ret
+ %1 = call i64 @llvm.ctpop.i64(i64 %a)
+ %2 = icmp eq i64 %1, 1
+ ret i1 %2
+}
+
+define i1 @ctpop_i64_ne_one(i64 %a) nounwind {
+; RV64I-LABEL: ctpop_i64_ne_one:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a1, a0, -1
+; RV64I-NEXT: and a1, a0, a1
+; RV64I-NEXT: snez a1, a1
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_i64_ne_one:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpop a0, a0
+; RV64ZBB-NEXT: addi a0, a0, -1
+; RV64ZBB-NEXT: snez a0, a0
+; RV64ZBB-NEXT: ret
+ %1 = call i64 @llvm.ctpop.i64(i64 %a)
+ %2 = icmp ne i64 %1, 1
+ ret i1 %2
+}
+
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
+
+define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
+; RV64I-LABEL: ctpop_v2i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: mv s0, a1
+; RV64I-NEXT: srli a1, a0, 1
+; RV64I-NEXT: lui a2, 349525
+; RV64I-NEXT: addiw a2, a2, 1365
+; RV64I-NEXT: slli a3, a2, 32
+; RV64I-NEXT: add s3, a2, a3
+; RV64I-NEXT: and a1, a1, s3
+; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: lui a1, 209715
+; RV64I-NEXT: addiw a1, a1, 819
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add s4, a1, a2
+; RV64I-NEXT: and a1, a0, s4
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, s4
+; RV64I-NEXT: add a0, a1, a0
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add s5, a1, a2
+; RV64I-NEXT: and a0, a0, s5
+; RV64I-NEXT: lui a1, 4112
+; RV64I-NEXT: addiw s1, a1, 257
+; RV64I-NEXT: slli a1, s1, 32
+; RV64I-NEXT: add s1, s1, a1
+; RV64I-NEXT: mv a1, s1
+; RV64I-NEXT: call __muldi3 at plt
+; RV64I-NEXT: srli s2, a0, 56
+; RV64I-NEXT: srli a0, s0, 1
+; RV64I-NEXT: and a0, a0, s3
+; RV64I-NEXT: sub s0, s0, a0
+; RV64I-NEXT: and a0, s0, s4
+; RV64I-NEXT: srli s0, s0, 2
+; RV64I-NEXT: and a1, s0, s4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: and a0, a0, s5
+; RV64I-NEXT: mv a1, s1
+; RV64I-NEXT: call __muldi3 at plt
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: mv a0, s2
+; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_v2i64:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpop a0, a0
+; RV64ZBB-NEXT: cpop a1, a1
+; RV64ZBB-NEXT: ret
+ %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+ ret <2 x i64> %1
+}
+
+define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind {
+; RV64I-LABEL: ctpop_v2i64_ult_two:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a2, a0, -1
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: addi a2, a1, -1
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: seqz a1, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_v2i64_ult_two:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpop a1, a1
+; RV64ZBB-NEXT: cpop a0, a0
+; RV64ZBB-NEXT: sltiu a0, a0, 2
+; RV64ZBB-NEXT: sltiu a1, a1, 2
+; RV64ZBB-NEXT: ret
+ %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+ %2 = icmp ult <2 x i64> %1, <i64 2, i64 2>
+ ret <2 x i1> %2
+}
+
+define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
+; RV64I-LABEL: ctpop_v2i64_ugt_one:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a2, a0, -1
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: snez a0, a0
+; RV64I-NEXT: addi a2, a1, -1
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: snez a1, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_v2i64_ugt_one:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpop a1, a1
+; RV64ZBB-NEXT: cpop a0, a0
+; RV64ZBB-NEXT: sltiu a0, a0, 2
+; RV64ZBB-NEXT: xori a0, a0, 1
+; RV64ZBB-NEXT: sltiu a1, a1, 2
+; RV64ZBB-NEXT: xori a1, a1, 1
+; RV64ZBB-NEXT: ret
+ %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+ %2 = icmp ugt <2 x i64> %1, <i64 1, i64 1>
+ ret <2 x i1> %2
+}
+
+define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
+; RV64I-LABEL: ctpop_v2i64_eq_one:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a2, a0, -1
+; RV64I-NEXT: and a2, a0, a2
+; RV64I-NEXT: seqz a2, a2
+; RV64I-NEXT: snez a0, a0
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: addi a2, a1, -1
+; RV64I-NEXT: and a2, a1, a2
+; RV64I-NEXT: seqz a2, a2
+; RV64I-NEXT: snez a1, a1
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_v2i64_eq_one:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpop a1, a1
+; RV64ZBB-NEXT: cpop a0, a0
+; RV64ZBB-NEXT: addi a0, a0, -1
+; RV64ZBB-NEXT: seqz a0, a0
+; RV64ZBB-NEXT: addi a1, a1, -1
+; RV64ZBB-NEXT: seqz a1, a1
+; RV64ZBB-NEXT: ret
+ %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+ %2 = icmp eq <2 x i64> %1, <i64 1, i64 1>
+ ret <2 x i1> %2
+}
+
+define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind {
+; RV64I-LABEL: ctpop_v2i64_ne_one:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a2, a0, -1
+; RV64I-NEXT: and a2, a0, a2
+; RV64I-NEXT: snez a2, a2
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: or a0, a0, a2
+; RV64I-NEXT: addi a2, a1, -1
+; RV64I-NEXT: and a2, a1, a2
+; RV64I-NEXT: snez a2, a2
+; RV64I-NEXT: seqz a1, a1
+; RV64I-NEXT: or a1, a1, a2
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop_v2i64_ne_one:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: cpop a1, a1
+; RV64ZBB-NEXT: cpop a0, a0
+; RV64ZBB-NEXT: addi a0, a0, -1
+; RV64ZBB-NEXT: snez a0, a0
+; RV64ZBB-NEXT: addi a1, a1, -1
+; RV64ZBB-NEXT: snez a1, a1
+; RV64ZBB-NEXT: ret
+ %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+ %2 = icmp ne <2 x i64> %1, <i64 1, i64 1>
+ ret <2 x i1> %2
+}
+
define signext i32 @sextb_i32(i32 signext %a) nounwind {
; RV64I-LABEL: sextb_i32:
; RV64I: # %bb.0:
@@ -732,10 +1258,10 @@ define i64 @sexth_i64(i64 %a) nounwind {
define signext i32 @min_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: min_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: blt a0, a1, .LBB18_2
+; RV64I-NEXT: blt a0, a1, .LBB36_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB18_2:
+; RV64I-NEXT: .LBB36_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: min_i32:
@@ -750,10 +1276,10 @@ define signext i32 @min_i32(i32 signext %a, i32 signext %b) nounwind {
define i64 @min_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: min_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: blt a0, a1, .LBB19_2
+; RV64I-NEXT: blt a0, a1, .LBB37_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB19_2:
+; RV64I-NEXT: .LBB37_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: min_i64:
@@ -768,10 +1294,10 @@ define i64 @min_i64(i64 %a, i64 %b) nounwind {
define signext i32 @max_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: max_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: blt a1, a0, .LBB20_2
+; RV64I-NEXT: blt a1, a0, .LBB38_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB20_2:
+; RV64I-NEXT: .LBB38_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: max_i32:
@@ -786,10 +1312,10 @@ define signext i32 @max_i32(i32 signext %a, i32 signext %b) nounwind {
define i64 @max_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: max_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: blt a1, a0, .LBB21_2
+; RV64I-NEXT: blt a1, a0, .LBB39_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB21_2:
+; RV64I-NEXT: .LBB39_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: max_i64:
@@ -804,10 +1330,10 @@ define i64 @max_i64(i64 %a, i64 %b) nounwind {
define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: minu_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: bltu a0, a1, .LBB22_2
+; RV64I-NEXT: bltu a0, a1, .LBB40_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB22_2:
+; RV64I-NEXT: .LBB40_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: minu_i32:
@@ -822,10 +1348,10 @@ define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind {
define i64 @minu_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: minu_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: bltu a0, a1, .LBB23_2
+; RV64I-NEXT: bltu a0, a1, .LBB41_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB23_2:
+; RV64I-NEXT: .LBB41_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: minu_i64:
@@ -840,10 +1366,10 @@ define i64 @minu_i64(i64 %a, i64 %b) nounwind {
define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: maxu_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: bltu a1, a0, .LBB24_2
+; RV64I-NEXT: bltu a1, a0, .LBB42_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB24_2:
+; RV64I-NEXT: .LBB42_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: maxu_i32:
@@ -858,10 +1384,10 @@ define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind {
define i64 @maxu_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: maxu_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: bltu a1, a0, .LBB25_2
+; RV64I-NEXT: bltu a1, a0, .LBB43_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB25_2:
+; RV64I-NEXT: .LBB43_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: maxu_i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
index 1a253bb209f1dba..ef0a293ad5fb9ca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
@@ -857,6 +857,92 @@ define <vscale x 16 x i32> @ctpop_nxv16i32(<vscale x 16 x i32> %va) {
%a = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> %va)
ret <vscale x 16 x i32> %a
}
+
+; We always emit vcpop.v for the scalable vector
+define <vscale x 16 x i1> @ctpop_nxv16i32_ult_two(<vscale x 16 x i32> %va) {
+; CHECK-LABEL: ctpop_nxv16i32_ult_two:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT: vadd.vi v16, v8, -1
+; CHECK-NEXT: vand.vv v8, v8, v16
+; CHECK-NEXT: vmseq.vi v0, v8, 0
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: ctpop_nxv16i32_ult_two:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT: vcpop.v v8, v8
+; CHECK-ZVBB-NEXT: vmsleu.vi v0, v8, 1
+; CHECK-ZVBB-NEXT: ret
+ %a = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> %va)
+ %cmp = icmp ult <vscale x 16 x i32> %a, shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 2, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+ ret <vscale x 16 x i1> %cmp
+}
+
+define <vscale x 16 x i1> @ctpop_nxv16i32_ugt_one(<vscale x 16 x i32> %va) {
+; CHECK-LABEL: ctpop_nxv16i32_ugt_one:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT: vadd.vi v16, v8, -1
+; CHECK-NEXT: vand.vv v8, v8, v16
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: ctpop_nxv16i32_ugt_one:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT: vcpop.v v8, v8
+; CHECK-ZVBB-NEXT: vmsgtu.vi v0, v8, 1
+; CHECK-ZVBB-NEXT: ret
+ %a = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> %va)
+ %cmp = icmp ugt <vscale x 16 x i32> %a, shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+ ret <vscale x 16 x i1> %cmp
+}
+
+define <vscale x 16 x i1> @ctpop_nxv16i32_eq_one(<vscale x 16 x i32> %va) {
+; CHECK-LABEL: ctpop_nxv16i32_eq_one:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT: vadd.vi v16, v8, -1
+; CHECK-NEXT: vand.vv v16, v8, v16
+; CHECK-NEXT: vmseq.vi v24, v16, 0
+; CHECK-NEXT: vmsne.vi v16, v8, 0
+; CHECK-NEXT: vmand.mm v0, v16, v24
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: ctpop_nxv16i32_eq_one:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT: vcpop.v v8, v8
+; CHECK-ZVBB-NEXT: vmseq.vi v0, v8, 1
+; CHECK-ZVBB-NEXT: ret
+ %a = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> %va)
+ %cmp = icmp eq <vscale x 16 x i32> %a, shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+ ret <vscale x 16 x i1> %cmp
+}
+
+define <vscale x 16 x i1> @ctpop_nxv16i32_ne_one(<vscale x 16 x i32> %va) {
+; CHECK-LABEL: ctpop_nxv16i32_ne_one:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT: vadd.vi v16, v8, -1
+; CHECK-NEXT: vand.vv v16, v8, v16
+; CHECK-NEXT: vmsne.vi v24, v16, 0
+; CHECK-NEXT: vmseq.vi v16, v8, 0
+; CHECK-NEXT: vmor.mm v0, v16, v24
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: ctpop_nxv16i32_ne_one:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT: vcpop.v v8, v8
+; CHECK-ZVBB-NEXT: vmsne.vi v0, v8, 1
+; CHECK-ZVBB-NEXT: ret
+ %a = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> %va)
+ %cmp = icmp ne <vscale x 16 x i32> %a, shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+ ret <vscale x 16 x i1> %cmp
+}
+
declare <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32>)
define <vscale x 1 x i64> @ctpop_nxv1i64(<vscale x 1 x i64> %va) {
@@ -1189,4 +1275,90 @@ define <vscale x 8 x i64> @ctpop_nxv8i64(<vscale x 8 x i64> %va) {
%a = call <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64> %va)
ret <vscale x 8 x i64> %a
}
+
+; We always emit vcpop.v for the scalable vector
+define <vscale x 8 x i1> @ctpop_nxv8i64_ult_two(<vscale x 8 x i64> %va) {
+; CHECK-LABEL: ctpop_nxv8i64_ult_two:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v16, v8, -1
+; CHECK-NEXT: vand.vv v8, v8, v16
+; CHECK-NEXT: vmseq.vi v0, v8, 0
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: ctpop_nxv8i64_ult_two:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT: vcpop.v v8, v8
+; CHECK-ZVBB-NEXT: vmsleu.vi v0, v8, 1
+; CHECK-ZVBB-NEXT: ret
+ %a = call <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64> %va)
+ %cmp = icmp ult <vscale x 8 x i64> %a, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 2, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+ ret <vscale x 8 x i1> %cmp
+}
+
+define <vscale x 8 x i1> @ctpop_nxv8i64_ugt_one(<vscale x 8 x i64> %va) {
+; CHECK-LABEL: ctpop_nxv8i64_ugt_one:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v16, v8, -1
+; CHECK-NEXT: vand.vv v8, v8, v16
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: ctpop_nxv8i64_ugt_one:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT: vcpop.v v8, v8
+; CHECK-ZVBB-NEXT: vmsgtu.vi v0, v8, 1
+; CHECK-ZVBB-NEXT: ret
+ %a = call <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64> %va)
+ %cmp = icmp ugt <vscale x 8 x i64> %a, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+ ret <vscale x 8 x i1> %cmp
+}
+
+define <vscale x 8 x i1> @ctpop_nxv8i64_eq_one(<vscale x 8 x i64> %va) {
+; CHECK-LABEL: ctpop_nxv8i64_eq_one:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v16, v8, -1
+; CHECK-NEXT: vand.vv v16, v8, v16
+; CHECK-NEXT: vmseq.vi v24, v16, 0
+; CHECK-NEXT: vmsne.vi v16, v8, 0
+; CHECK-NEXT: vmand.mm v0, v16, v24
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: ctpop_nxv8i64_eq_one:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT: vcpop.v v8, v8
+; CHECK-ZVBB-NEXT: vmseq.vi v0, v8, 1
+; CHECK-ZVBB-NEXT: ret
+ %a = call <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64> %va)
+ %cmp = icmp eq <vscale x 8 x i64> %a, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+ ret <vscale x 8 x i1> %cmp
+}
+
+define <vscale x 8 x i1> @ctpop_nxv8i64_ne_one(<vscale x 8 x i64> %va) {
+; CHECK-LABEL: ctpop_nxv8i64_ne_one:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v16, v8, -1
+; CHECK-NEXT: vand.vv v16, v8, v16
+; CHECK-NEXT: vmsne.vi v24, v16, 0
+; CHECK-NEXT: vmseq.vi v16, v8, 0
+; CHECK-NEXT: vmor.mm v0, v16, v24
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: ctpop_nxv8i64_ne_one:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT: vcpop.v v8, v8
+; CHECK-ZVBB-NEXT: vmsne.vi v0, v8, 1
+; CHECK-ZVBB-NEXT: ret
+ %a = call <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64> %va)
+ %cmp = icmp ne <vscale x 8 x i64> %a, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+ ret <vscale x 8 x i1> %cmp
+}
+
declare <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
index 383d5110ecceb92..c7b6db226ee5f6e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
@@ -824,6 +824,222 @@ define void @ctpop_v8i32(ptr %x, ptr %y) {
store <8 x i32> %c, ptr %x
ret void
}
+define <8 x i1> @ctpop_v8i32_ult_two(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v8i32_ult_two:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-NEXT: vle32.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: ctpop_v8i32_ult_two:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vle32.v v8, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vle32.v v9, (a0)
+; LMULMAX1-NEXT: vadd.vi v10, v8, -1
+; LMULMAX1-NEXT: vand.vv v8, v8, v10
+; LMULMAX1-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v8, 0
+; LMULMAX1-NEXT: vmerge.vim v8, v8, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-NEXT: vmseq.vi v0, v9, 0
+; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v9, 0
+; LMULMAX1-NEXT: vmerge.vim v9, v9, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslideup.vi v8, v9, 4
+; LMULMAX1-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v8i32_ult_two:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmsleu.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <8 x i32>, ptr %x
+ %b = load <8 x i32>, ptr %y
+ %c = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+ %cmp = icmp ult <8 x i32> %c, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i1> %cmp
+}
+define <8 x i1> @ctpop_v8i32_ugt_one(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v8i32_ugt_one:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-NEXT: vle32.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: ctpop_v8i32_ugt_one:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vle32.v v8, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vle32.v v9, (a0)
+; LMULMAX1-NEXT: vadd.vi v10, v8, -1
+; LMULMAX1-NEXT: vand.vv v8, v8, v10
+; LMULMAX1-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v8, 0
+; LMULMAX1-NEXT: vmerge.vim v8, v8, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v9, 0
+; LMULMAX1-NEXT: vmerge.vim v9, v9, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslideup.vi v8, v9, 4
+; LMULMAX1-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v8i32_ugt_one:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmsgtu.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <8 x i32>, ptr %x
+ %b = load <8 x i32>, ptr %y
+ %c = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+ %cmp = icmp ugt <8 x i32> %c, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i1> %cmp
+}
+define <8 x i1> @ctpop_v8i32_eq_one(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v8i32_eq_one:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-NEXT: vle32.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-NEXT: vmseq.vi v12, v10, 0
+; LMULMAX2-NEXT: vmsne.vi v10, v8, 0
+; LMULMAX2-NEXT: vmand.mm v0, v10, v12
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: ctpop_v8i32_eq_one:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vle32.v v8, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vle32.v v9, (a0)
+; LMULMAX1-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v10, 0
+; LMULMAX1-NEXT: vmerge.vim v11, v10, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v12, 0
+; LMULMAX1-NEXT: vmerge.vim v13, v12, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslideup.vi v11, v13, 4
+; LMULMAX1-NEXT: vmsne.vi v11, v11, 0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vadd.vi v13, v8, -1
+; LMULMAX1-NEXT: vand.vv v8, v8, v13
+; LMULMAX1-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vmerge.vim v8, v10, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-NEXT: vmseq.vi v0, v9, 0
+; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vmerge.vim v9, v12, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslideup.vi v8, v9, 4
+; LMULMAX1-NEXT: vmsne.vi v8, v8, 0
+; LMULMAX1-NEXT: vmand.mm v0, v11, v8
+; LMULMAX1-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v8i32_eq_one:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmseq.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <8 x i32>, ptr %x
+ %b = load <8 x i32>, ptr %y
+ %c = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+ %cmp = icmp eq <8 x i32> %c, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i1> %cmp
+}
+define <8 x i1> @ctpop_v8i32_ne_one(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v8i32_ne_one:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-NEXT: vle32.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-NEXT: vmsne.vi v12, v10, 0
+; LMULMAX2-NEXT: vmseq.vi v10, v8, 0
+; LMULMAX2-NEXT: vmor.mm v0, v10, v12
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: ctpop_v8i32_ne_one:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vle32.v v8, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vle32.v v9, (a0)
+; LMULMAX1-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v10, 0
+; LMULMAX1-NEXT: vmerge.vim v11, v10, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vmseq.vi v0, v9, 0
+; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v12, 0
+; LMULMAX1-NEXT: vmerge.vim v13, v12, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslideup.vi v11, v13, 4
+; LMULMAX1-NEXT: vmsne.vi v11, v11, 0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vadd.vi v13, v8, -1
+; LMULMAX1-NEXT: vand.vv v8, v8, v13
+; LMULMAX1-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vmerge.vim v8, v10, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vmerge.vim v9, v12, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslideup.vi v8, v9, 4
+; LMULMAX1-NEXT: vmsne.vi v8, v8, 0
+; LMULMAX1-NEXT: vmor.mm v0, v11, v8
+; LMULMAX1-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v8i32_ne_one:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmsne.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <8 x i32>, ptr %x
+ %b = load <8 x i32>, ptr %y
+ %c = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+ %cmp = icmp ne <8 x i32> %c, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i1> %cmp
+}
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
define void @ctpop_v4i64(ptr %x, ptr %y) {
@@ -1022,4 +1238,362 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
store <4 x i64> %c, ptr %x
ret void
}
+define <4 x i1> @ctpop_v4i64_ult_two(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v4i64_ult_two:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-NEXT: vle64.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-RV32-LABEL: ctpop_v4i64_ult_two:
+; LMULMAX1-RV32: # %bb.0:
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vle64.v v8, (a1)
+; LMULMAX1-RV32-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v10, -1
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v11, v9, v10
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v11
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v11, 0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmseq.vv v0, v9, v11
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v9, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v9, v9, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v10, v8, v10
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT: vmseq.vv v0, v8, v11
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v8, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v8, v8, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vslideup.vi v9, v8, 2
+; LMULMAX1-RV32-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-RV32-NEXT: ret
+;
+; LMULMAX1-RV64-LABEL: ctpop_v4i64_ult_two:
+; LMULMAX1-RV64: # %bb.0:
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vle64.v v8, (a0)
+; LMULMAX1-RV64-NEXT: addi a0, a0, 16
+; LMULMAX1-RV64-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV64-NEXT: vadd.vi v10, v8, -1
+; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v10
+; LMULMAX1-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v8, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v8, v8, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-RV64-NEXT: vmseq.vi v0, v9, 0
+; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v9, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v9, v9, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vslideup.vi v8, v9, 2
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v4i64_ult_two:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; ZVBB-NEXT: vle64.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmsleu.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <4 x i64>, ptr %x
+ %b = load <4 x i64>, ptr %y
+ %c = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+ %cmp = icmp ult <4 x i64> %c, <i64 2, i64 2, i64 2, i64 2>
+ ret <4 x i1> %cmp
+}
+define <4 x i1> @ctpop_v4i64_ugt_one(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v4i64_ugt_one:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-NEXT: vle64.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-RV32-LABEL: ctpop_v4i64_ugt_one:
+; LMULMAX1-RV32: # %bb.0:
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vle64.v v8, (a1)
+; LMULMAX1-RV32-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v10, -1
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v11, v9, v10
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v11
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v11, 0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmsne.vv v0, v9, v11
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v9, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v9, v9, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v10, v8, v10
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT: vmsne.vv v0, v8, v11
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v8, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v8, v8, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vslideup.vi v9, v8, 2
+; LMULMAX1-RV32-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-RV32-NEXT: ret
+;
+; LMULMAX1-RV64-LABEL: ctpop_v4i64_ugt_one:
+; LMULMAX1-RV64: # %bb.0:
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vle64.v v8, (a0)
+; LMULMAX1-RV64-NEXT: addi a0, a0, 16
+; LMULMAX1-RV64-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV64-NEXT: vadd.vi v10, v8, -1
+; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v10
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v8, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v8, v8, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v9, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v9, v9, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vslideup.vi v8, v9, 2
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v4i64_ugt_one:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; ZVBB-NEXT: vle64.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmsgtu.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <4 x i64>, ptr %x
+ %b = load <4 x i64>, ptr %y
+ %c = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+ %cmp = icmp ugt <4 x i64> %c, <i64 1, i64 1, i64 1, i64 1>
+ ret <4 x i1> %cmp
+}
+define <4 x i1> @ctpop_v4i64_eq_one(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v4i64_eq_one:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-NEXT: vle64.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-NEXT: vmseq.vi v12, v10, 0
+; LMULMAX2-NEXT: vmsne.vi v10, v8, 0
+; LMULMAX2-NEXT: vmand.mm v0, v10, v12
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-RV32-LABEL: ctpop_v4i64_eq_one:
+; LMULMAX1-RV32: # %bb.0:
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vle64.v v8, (a1)
+; LMULMAX1-RV32-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v10, 0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmsne.vv v0, v9, v10
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v11, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v12, v11, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmsne.vv v0, v8, v10
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v13, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v14, v13, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vslideup.vi v12, v14, 2
+; LMULMAX1-RV32-NEXT: vmsne.vi v12, v12, 0
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v14, -1
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v15, v9, v14
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v15
+; LMULMAX1-RV32-NEXT: vmseq.vv v0, v9, v10
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vmerge.vim v9, v11, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v11, v8, v14
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v11
+; LMULMAX1-RV32-NEXT: vmseq.vv v0, v8, v10
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV32-NEXT: vmerge.vim v8, v13, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vslideup.vi v9, v8, 2
+; LMULMAX1-RV32-NEXT: vmsne.vi v8, v9, 0
+; LMULMAX1-RV32-NEXT: vmand.mm v0, v12, v8
+; LMULMAX1-RV32-NEXT: ret
+;
+; LMULMAX1-RV64-LABEL: ctpop_v4i64_eq_one:
+; LMULMAX1-RV64: # %bb.0:
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vle64.v v8, (a0)
+; LMULMAX1-RV64-NEXT: addi a0, a0, 16
+; LMULMAX1-RV64-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v10, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v11, v10, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v12, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v13, v12, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vslideup.vi v11, v13, 2
+; LMULMAX1-RV64-NEXT: vmsne.vi v11, v11, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vadd.vi v13, v8, -1
+; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v13
+; LMULMAX1-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vmerge.vim v8, v10, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-RV64-NEXT: vmseq.vi v0, v9, 0
+; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV64-NEXT: vmerge.vim v9, v12, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vslideup.vi v8, v9, 2
+; LMULMAX1-RV64-NEXT: vmsne.vi v8, v8, 0
+; LMULMAX1-RV64-NEXT: vmand.mm v0, v11, v8
+; LMULMAX1-RV64-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v4i64_eq_one:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; ZVBB-NEXT: vle64.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmseq.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <4 x i64>, ptr %x
+ %b = load <4 x i64>, ptr %y
+ %c = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+ %cmp = icmp eq <4 x i64> %c, <i64 1, i64 1, i64 1, i64 1>
+ ret <4 x i1> %cmp
+}
+define <4 x i1> @ctpop_v4i64_ne_one(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v4i64_ne_one:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-NEXT: vle64.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-NEXT: vmsne.vi v12, v10, 0
+; LMULMAX2-NEXT: vmseq.vi v10, v8, 0
+; LMULMAX2-NEXT: vmor.mm v0, v10, v12
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-RV32-LABEL: ctpop_v4i64_ne_one:
+; LMULMAX1-RV32: # %bb.0:
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vle64.v v8, (a1)
+; LMULMAX1-RV32-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v10, 0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmseq.vv v0, v9, v10
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v11, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v12, v11, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmseq.vv v0, v8, v10
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v13, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v14, v13, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vslideup.vi v12, v14, 2
+; LMULMAX1-RV32-NEXT: vmsne.vi v12, v12, 0
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v14, -1
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v15, v9, v14
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v15
+; LMULMAX1-RV32-NEXT: vmsne.vv v0, v9, v10
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vmerge.vim v9, v11, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v11, v8, v14
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v11
+; LMULMAX1-RV32-NEXT: vmsne.vv v0, v8, v10
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV32-NEXT: vmerge.vim v8, v13, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vslideup.vi v9, v8, 2
+; LMULMAX1-RV32-NEXT: vmsne.vi v8, v9, 0
+; LMULMAX1-RV32-NEXT: vmor.mm v0, v12, v8
+; LMULMAX1-RV32-NEXT: ret
+;
+; LMULMAX1-RV64-LABEL: ctpop_v4i64_ne_one:
+; LMULMAX1-RV64: # %bb.0:
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vle64.v v8, (a0)
+; LMULMAX1-RV64-NEXT: addi a0, a0, 16
+; LMULMAX1-RV64-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v10, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v11, v10, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vmseq.vi v0, v9, 0
+; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v12, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v13, v12, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vslideup.vi v11, v13, 2
+; LMULMAX1-RV64-NEXT: vmsne.vi v11, v11, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vadd.vi v13, v8, -1
+; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v13
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vmerge.vim v8, v10, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV64-NEXT: vmerge.vim v9, v12, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vslideup.vi v8, v9, 2
+; LMULMAX1-RV64-NEXT: vmsne.vi v8, v8, 0
+; LMULMAX1-RV64-NEXT: vmor.mm v0, v11, v8
+; LMULMAX1-RV64-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v4i64_ne_one:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; ZVBB-NEXT: vle64.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmsne.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <4 x i64>, ptr %x
+ %b = load <4 x i64>, ptr %y
+ %c = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+ %cmp = icmp ne <4 x i64> %c, <i64 1, i64 1, i64 1, i64 1>
+ ret <4 x i1> %cmp
+}
declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>)
More information about the llvm-commits
mailing list