[llvm] [RISCV] Construct constants via instructions if materialization is costly (PR #86926)
Wang Pengcheng via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 28 02:57:06 PDT 2024
https://github.com/wangpc-pp created https://github.com/llvm/llvm-project/pull/86926
For RISC-V, it is costly to materialize the constants used when lowering
`ISD::CTPOP`/`ISD::VP_CTPOP`.
We can query the materialization cost via `RISCVMatInt::getIntMatCost`,
and if the cost is larger than 2, we instead derive the constant from an
already-materialized one using two instructions.
This fixes #86207.
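To illustrate the trick outside of SelectionDAG, here is a minimal standalone C++ sketch (not part of the patch; the test value 0xDEADBEEFCAFEF00D and its expected count of 42 are picked purely for illustration). Once 0x0F0F0F0F0F0F0F0F is materialized (lui/addiw/slli/add on RV64, as seen in the updated tests), each remaining CTPOP mask can be derived in two cheap ALU instructions, and the usual bit-hack popcount then reuses them:

// Standalone sketch checking the mask-derivation identities the lowering uses.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Mask0F = 0x0F0F0F0F0F0F0F0FULL; // materialized once
  const uint64_t Mask33 = Mask0F ^ (Mask0F << 2); // 0x3333... in two ops
  const uint64_t Mask55 = Mask33 ^ (Mask33 << 1); // 0x5555... in two ops
  const uint64_t Mask01 = Mask0F & (Mask0F >> 3); // 0x0101... in two ops
  assert(Mask33 == 0x3333333333333333ULL);
  assert(Mask55 == 0x5555555555555555ULL);
  assert(Mask01 == 0x0101010101010101ULL);

  // The classic parallel bit-count, reusing the derived masks.
  uint64_t V = 0xDEADBEEFCAFEF00DULL; // arbitrary test input
  V = V - ((V >> 1) & Mask55);
  V = (V & Mask33) + ((V >> 2) & Mask33);
  V = (V + (V >> 4)) & Mask0F;
  V = (V * Mask01) >> 56; // Len - 8 for 64-bit
  printf("popcount = %llu\n", (unsigned long long)V);
  assert(V == 42); // expected population count of the test input
  return 0;
}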
From dabc3a5d5d3be0ddada58d8ace8a48cd2ffc3b43 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Thu, 28 Mar 2024 17:23:24 +0800
Subject: [PATCH] [RISCV] Construct constants via instructions if
materialization is costly
For RISC-V, it is costly to materialize the constants used when lowering
`ISD::CTPOP`/`ISD::VP_CTPOP`.
We can query the materialization cost via `RISCVMatInt::getIntMatCost`,
and if the cost is larger than 2, we instead derive the constant from an
already-materialized one using two instructions.
This fixes #86207.
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 201 +-
llvm/lib/Target/RISCV/RISCVISelLowering.h | 3 +
llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll | 1608 ++++--
.../CodeGen/RISCV/ctz_zero_return_test.ll | 107 +-
llvm/test/CodeGen/RISCV/pr56457.ll | 24 +-
.../RISCV/rv64-legal-i32/rv64xtheadbb.ll | 57 +-
.../CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll | 95 +-
llvm/test/CodeGen/RISCV/rv64xtheadbb.ll | 187 +-
llvm/test/CodeGen/RISCV/rv64zbb.ll | 383 +-
llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll | 664 ++-
llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll | 372 +-
llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll | 1946 ++++---
llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll | 836 ++-
llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll | 2134 ++++----
.../RISCV/rvv/fixed-vectors-ctlz-vp.ll | 4386 +++++++++-------
.../CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll | 280 +-
.../RISCV/rvv/fixed-vectors-ctpop-vp.ll | 2059 +++++---
.../CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll | 164 +-
.../RISCV/rvv/fixed-vectors-cttz-vp.ll | 4566 ++++++++++-------
.../CodeGen/RISCV/rvv/fixed-vectors-cttz.ll | 392 +-
llvm/test/CodeGen/RISCV/sextw-removal.ll | 58 +-
21 files changed, 11809 insertions(+), 8713 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e6814c5f71a09b..031030990d4405 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -391,7 +391,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
}
} else if (!Subtarget.hasVendorXCVbitmanip()) {
- setOperationAction({ISD::CTTZ, ISD::CTPOP}, XLenVT, Expand);
+ setOperationAction(ISD::CTTZ, XLenVT, Expand);
+ setOperationAction(ISD::CTPOP, XLenVT,
+ Subtarget.is64Bit() ? Custom : Expand);
if (RV64LegalI32 && Subtarget.is64Bit())
setOperationAction({ISD::CTTZ, ISD::CTPOP}, MVT::i32, Expand);
}
@@ -901,11 +903,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
VT, Custom);
} else {
setOperationAction({ISD::BITREVERSE, ISD::VP_BITREVERSE}, VT, Expand);
- setOperationAction({ISD::CTLZ, ISD::CTTZ, ISD::CTPOP}, VT, Expand);
+ setOperationAction({ISD::CTLZ, ISD::CTTZ}, VT, Expand);
setOperationAction({ISD::VP_CTLZ, ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ,
- ISD::VP_CTTZ_ZERO_UNDEF, ISD::VP_CTPOP},
+ ISD::VP_CTTZ_ZERO_UNDEF},
VT, Expand);
+ setOperationAction({ISD::CTPOP, ISD::VP_CTPOP}, VT, Custom);
+
// Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
// range of f32.
EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
@@ -1238,6 +1242,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTPOP},
VT, Custom);
} else {
+ setOperationAction({ISD::CTPOP, ISD::VP_CTPOP}, VT, Custom);
// Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
// range of f32.
EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
@@ -6746,8 +6751,18 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::UDIV:
case ISD::UREM:
case ISD::BSWAP:
- case ISD::CTPOP:
return lowerToScalableOp(Op, DAG);
+ case ISD::CTPOP: {
+ if (Op.getValueType().isScalarInteger())
+ return lowerScalarCTPOP(Op, DAG);
+ if (Subtarget.hasStdExtZvbb())
+ return lowerToScalableOp(Op, DAG);
+ return lowerVectorCTPOP(Op, DAG);
+ }
+ case ISD::VP_CTPOP:
+ if (Subtarget.hasStdExtZvbb())
+ return lowerVPOp(Op, DAG);
+ return lowerVectorCTPOP(Op, DAG);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
@@ -6972,8 +6987,6 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
if (Subtarget.hasStdExtZvbb())
return lowerVPOp(Op, DAG);
return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG);
- case ISD::VP_CTPOP:
- return lowerVPOp(Op, DAG);
case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
return lowerVPStridedLoad(Op, DAG);
case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
@@ -10755,6 +10768,182 @@ SDValue RISCVTargetLowering::lowerABS(SDValue Op, SelectionDAG &DAG) const {
return Max;
}
+SDValue RISCVTargetLowering::lowerScalarCTPOP(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc DL(Op);
+ MVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout()).getSimpleVT();
+ unsigned Len = VT.getScalarSizeInBits();
+ assert(VT.isInteger() && "lowerScalarCTPOP not implemented for this type.");
+
+ SDValue V = Op.getOperand(0);
+
+ // This is the same algorithm as TargetLowering::expandCTPOP, from
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ // 0x0F0F0F0F...
+ const APInt &Constant0F = APInt::getSplat(Len, APInt(8, 0x0F));
+ SDValue Mask0F = DAG.getConstant(Constant0F, DL, VT, false, true);
+ // 0x33333333... = (0x0F0F0F0F... ^ (0x0F0F0F0F... << 2))
+ const APInt &Constant33 = APInt::getSplat(Len, APInt(8, 0x33));
+ SDValue Mask33 =
+ RISCVMatInt::getIntMatCost(Constant33, VT.getScalarSizeInBits(),
+ Subtarget) > 2
+ ? DAG.getNode(ISD::XOR, DL, VT, Mask0F,
+ DAG.getNode(ISD::SHL, DL, VT, Mask0F,
+ DAG.getShiftAmountConstant(2, VT, DL)))
+ : DAG.getConstant(Constant33, DL, VT);
+ // 0x55555555... = (0x33333333... ^ (0x33333333... << 1))
+ const APInt &Constant55 = APInt::getSplat(Len, APInt(8, 0x55));
+ SDValue Mask55 =
+ RISCVMatInt::getIntMatCost(Constant55, VT.getScalarSizeInBits(),
+ Subtarget) > 2
+ ? DAG.getNode(ISD::XOR, DL, VT, Mask33,
+ DAG.getNode(ISD::SHL, DL, VT, Mask33,
+ DAG.getShiftAmountConstant(1, VT, DL)))
+ : DAG.getConstant(Constant55, DL, VT);
+
+ // v = v - ((v >> 1) & 0x55555555...)
+ V = DAG.getNode(ISD::SUB, DL, VT, V,
+ DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(ISD::SRL, DL, VT, V,
+ DAG.getConstant(1, DL, ShVT)),
+ Mask55));
+ // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+ V = DAG.getNode(ISD::ADD, DL, VT, DAG.getNode(ISD::AND, DL, VT, V, Mask33),
+ DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(ISD::SRL, DL, VT, V,
+ DAG.getConstant(2, DL, ShVT)),
+ Mask33));
+ // v = (v + (v >> 4)) & 0x0F0F0F0F...
+ V = DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(ISD::ADD, DL, VT, V,
+ DAG.getNode(ISD::SRL, DL, VT, V,
+ DAG.getConstant(4, DL, ShVT))),
+ Mask0F);
+
+ // v = (v * 0x01010101...) >> (Len - 8)
+ // 0x01010101... == (0x0F0F0F0F... & (0x0F0F0F0F... >> 3))
+ const APInt &Constant01 = APInt::getSplat(Len, APInt(8, 0x01));
+ SDValue Mask01 =
+ RISCVMatInt::getIntMatCost(Constant01, VT.getScalarSizeInBits(),
+ Subtarget) > 2
+ ? DAG.getNode(ISD::AND, DL, VT, Mask0F,
+ DAG.getNode(ISD::SRL, DL, VT, Mask0F,
+ DAG.getShiftAmountConstant(3, VT, DL)))
+ : DAG.getConstant(Constant01, DL, VT);
+ return DAG.getNode(ISD::SRL, DL, VT, DAG.getNode(ISD::MUL, DL, VT, V, Mask01),
+ DAG.getConstant(Len - 8, DL, ShVT));
+}
+
+SDValue RISCVTargetLowering::lowerVectorCTPOP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ unsigned Len = VT.getScalarSizeInBits();
+ assert(VT.isInteger() && "lowerVectorCTPOP not implemented for this type.");
+
+ SDValue V = Op.getOperand(0);
+ MVT ContainerVT = VT;
+ if (VT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VT);
+ V = convertToScalableVector(ContainerVT, V, DAG, Subtarget);
+ }
+
+ SDValue Mask, VL;
+ if (Op->getOpcode() == ISD::VP_CTPOP) {
+ Mask = Op->getOperand(1);
+ if (VT.isFixedLengthVector())
+ Mask = convertToScalableVector(getMaskTypeFor(ContainerVT), Mask, DAG,
+ Subtarget);
+ VL = Op->getOperand(2);
+ } else
+ std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+ // This is the same algorithm as TargetLowering::expandVPCTPOP, from
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+
+ // 0x0F0F0F0F...
+ const APInt &Constant0F = APInt::getSplat(Len, APInt(8, 0x0F));
+ SDValue Mask0F = DAG.getConstant(Constant0F, DL, ContainerVT);
+ // 0x33333333... = (0x0F0F0F0F... ^ (0x0F0F0F0F... << 2))
+ const APInt &Constant33 = APInt::getSplat(Len, APInt(8, 0x33));
+ SDValue Mask33 =
+ RISCVMatInt::getIntMatCost(Constant33, ContainerVT.getScalarSizeInBits(),
+ Subtarget) > 2
+ ? DAG.getNode(RISCVISD::XOR_VL, DL, ContainerVT, Mask0F,
+ DAG.getNode(RISCVISD::SHL_VL, DL, ContainerVT, Mask0F,
+ DAG.getConstant(2, DL, ContainerVT),
+ DAG.getUNDEF(ContainerVT), Mask, VL),
+ DAG.getUNDEF(ContainerVT), Mask, VL)
+ : DAG.getConstant(Constant33, DL, ContainerVT);
+ // 0x55555555... = (0x33333333... ^ (0x33333333... << 1))
+ const APInt &Constant55 = APInt::getSplat(Len, APInt(8, 0x55));
+ SDValue Mask55 =
+ RISCVMatInt::getIntMatCost(Constant55, ContainerVT.getScalarSizeInBits(),
+ Subtarget) > 2
+ ? DAG.getNode(RISCVISD::XOR_VL, DL, ContainerVT, Mask33,
+ DAG.getNode(RISCVISD::SHL_VL, DL, ContainerVT, Mask33,
+ DAG.getConstant(1, DL, ContainerVT),
+ DAG.getUNDEF(ContainerVT), Mask, VL),
+ DAG.getUNDEF(ContainerVT), Mask, VL)
+ : DAG.getConstant(Constant55, DL, ContainerVT);
+
+ SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5;
+
+ // v = v - ((v >> 1) & 0x55555555...)
+ Tmp1 = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT,
+ DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, V,
+ DAG.getConstant(1, DL, ContainerVT),
+ DAG.getUNDEF(ContainerVT), Mask, VL),
+ Mask55, DAG.getUNDEF(ContainerVT), Mask, VL);
+ V = DAG.getNode(RISCVISD::SUB_VL, DL, ContainerVT, V, Tmp1,
+ DAG.getUNDEF(ContainerVT), Mask, VL);
+
+ // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+ Tmp2 = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, V, Mask33,
+ DAG.getUNDEF(ContainerVT), Mask, VL);
+ Tmp3 = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT,
+ DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, V,
+ DAG.getConstant(2, DL, ContainerVT),
+ DAG.getUNDEF(ContainerVT), Mask, VL),
+ Mask33, DAG.getUNDEF(ContainerVT), Mask, VL);
+ V = DAG.getNode(RISCVISD::ADD_VL, DL, ContainerVT, Tmp2, Tmp3,
+ DAG.getUNDEF(ContainerVT), Mask, VL);
+
+ // v = (v + (v >> 4)) & 0x0F0F0F0F...
+ Tmp4 = DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, V,
+ DAG.getConstant(4, DL, ContainerVT),
+ DAG.getUNDEF(ContainerVT), Mask, VL);
+ Tmp5 = DAG.getNode(RISCVISD::ADD_VL, DL, ContainerVT, V, Tmp4,
+ DAG.getUNDEF(ContainerVT), Mask, VL);
+ V = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, Tmp5, Mask0F,
+ DAG.getUNDEF(ContainerVT), Mask, VL);
+
+ if (Len > 8) {
+ // v = (v * 0x01010101...) >> (Len - 8)
+ // 0x01010101... == (0x0F0F0F0F... & (0x0F0F0F0F... >> 3))
+ const APInt &Constant01 = APInt::getSplat(Len, APInt(8, 0x01));
+ SDValue Mask01 =
+ RISCVMatInt::getIntMatCost(
+ Constant01, ContainerVT.getScalarSizeInBits(), Subtarget) > 2
+ ? DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, Mask0F,
+ DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, Mask0F,
+ DAG.getConstant(3, DL, ContainerVT),
+ DAG.getUNDEF(ContainerVT), Mask, VL),
+ DAG.getUNDEF(ContainerVT), Mask, VL)
+ : DAG.getConstant(Constant01, DL, ContainerVT);
+ V = DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT,
+ DAG.getNode(RISCVISD::MUL_VL, DL, ContainerVT, V, Mask01,
+ DAG.getUNDEF(ContainerVT), Mask, VL),
+ DAG.getConstant(Len - 8, DL, ContainerVT),
+ DAG.getUNDEF(ContainerVT), Mask, VL);
+ }
+
+ if (VT.isFixedLengthVector())
+ V = convertFromScalableVector(VT, V, DAG, Subtarget);
+ return V;
+}
+
SDValue RISCVTargetLowering::lowerFixedLengthVectorFCOPYSIGNToRVV(
SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index c11b1464757c7f..cc8a18d9088106 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -959,6 +959,9 @@ class RISCVTargetLowering : public TargetLowering {
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerScalarCTPOP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVectorCTPOP(SDValue Op, SelectionDAG &DAG) const;
+
SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 455e6e54c9b396..1eaf91096336f3 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -53,28 +53,77 @@ define i8 @test_cttz_i8(i8 %a) nounwind {
; RV32_NOZBB-NEXT: li a0, 8
; RV32_NOZBB-NEXT: ret
;
-; RV64NOZBB-LABEL: test_cttz_i8:
-; RV64NOZBB: # %bb.0:
-; RV64NOZBB-NEXT: andi a1, a0, 255
-; RV64NOZBB-NEXT: beqz a1, .LBB0_2
-; RV64NOZBB-NEXT: # %bb.1: # %cond.false
-; RV64NOZBB-NEXT: addi a1, a0, -1
-; RV64NOZBB-NEXT: not a0, a0
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: srli a1, a0, 1
-; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
-; RV64NOZBB-NEXT: andi a1, a0, 51
-; RV64NOZBB-NEXT: srli a0, a0, 2
-; RV64NOZBB-NEXT: andi a0, a0, 51
-; RV64NOZBB-NEXT: add a0, a1, a0
-; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: andi a0, a0, 15
-; RV64NOZBB-NEXT: ret
-; RV64NOZBB-NEXT: .LBB0_2:
-; RV64NOZBB-NEXT: li a0, 8
-; RV64NOZBB-NEXT: ret
+; RV64I-LABEL: test_cttz_i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a1, a0, 255
+; RV64I-NEXT: beqz a1, .LBB0_2
+; RV64I-NEXT: # %bb.1: # %cond.false
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: addi a3, a0, -1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: andi a3, a0, 255
+; RV64I-NEXT: srli a0, a0, 1
+; RV64I-NEXT: andi a0, a0, 85
+; RV64I-NEXT: sub a3, a3, a0
+; RV64I-NEXT: and a0, a3, a2
+; RV64I-NEXT: srli a3, a3, 2
+; RV64I-NEXT: and a2, a3, a2
+; RV64I-NEXT: add a0, a0, a2
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: srli a0, a0, 56
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB0_2:
+; RV64I-NEXT: li a0, 8
+; RV64I-NEXT: ret
+;
+; RV64M-LABEL: test_cttz_i8:
+; RV64M: # %bb.0:
+; RV64M-NEXT: andi a1, a0, 255
+; RV64M-NEXT: beqz a1, .LBB0_2
+; RV64M-NEXT: # %bb.1: # %cond.false
+; RV64M-NEXT: lui a1, 61681
+; RV64M-NEXT: addiw a1, a1, -241
+; RV64M-NEXT: slli a2, a1, 32
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: addi a3, a0, -1
+; RV64M-NEXT: not a0, a0
+; RV64M-NEXT: and a0, a0, a3
+; RV64M-NEXT: andi a3, a0, 255
+; RV64M-NEXT: srli a0, a0, 1
+; RV64M-NEXT: andi a0, a0, 85
+; RV64M-NEXT: sub a3, a3, a0
+; RV64M-NEXT: and a0, a3, a2
+; RV64M-NEXT: srli a3, a3, 2
+; RV64M-NEXT: and a2, a3, a2
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: and a0, a0, a1
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
+; RV64M-NEXT: mul a0, a0, a1
+; RV64M-NEXT: srli a0, a0, 56
+; RV64M-NEXT: ret
+; RV64M-NEXT: .LBB0_2:
+; RV64M-NEXT: li a0, 8
+; RV64M-NEXT: ret
;
; RV32ZBB-LABEL: test_cttz_i8:
; RV32ZBB: # %bb.0:
@@ -154,35 +203,83 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
; RV32_NOZBB-NEXT: li a0, 16
; RV32_NOZBB-NEXT: ret
;
-; RV64NOZBB-LABEL: test_cttz_i16:
-; RV64NOZBB: # %bb.0:
-; RV64NOZBB-NEXT: slli a1, a0, 48
-; RV64NOZBB-NEXT: beqz a1, .LBB1_2
-; RV64NOZBB-NEXT: # %bb.1: # %cond.false
-; RV64NOZBB-NEXT: addi a1, a0, -1
-; RV64NOZBB-NEXT: not a0, a0
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: srli a1, a0, 1
-; RV64NOZBB-NEXT: lui a2, 5
-; RV64NOZBB-NEXT: addiw a2, a2, 1365
-; RV64NOZBB-NEXT: and a1, a1, a2
-; RV64NOZBB-NEXT: sub a0, a0, a1
-; RV64NOZBB-NEXT: lui a1, 3
-; RV64NOZBB-NEXT: addiw a1, a1, 819
-; RV64NOZBB-NEXT: and a2, a0, a1
-; RV64NOZBB-NEXT: srli a0, a0, 2
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: add a0, a2, a0
-; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: andi a1, a0, 15
-; RV64NOZBB-NEXT: slli a0, a0, 52
-; RV64NOZBB-NEXT: srli a0, a0, 60
-; RV64NOZBB-NEXT: add a0, a1, a0
-; RV64NOZBB-NEXT: ret
-; RV64NOZBB-NEXT: .LBB1_2:
-; RV64NOZBB-NEXT: li a0, 16
-; RV64NOZBB-NEXT: ret
+; RV64I-LABEL: test_cttz_i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a1, a0, 48
+; RV64I-NEXT: beqz a1, .LBB1_2
+; RV64I-NEXT: # %bb.1: # %cond.false
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: addi a3, a0, -1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 5
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: slli a0, a0, 48
+; RV64I-NEXT: srli a0, a0, 48
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: srli a0, a0, 56
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB1_2:
+; RV64I-NEXT: li a0, 16
+; RV64I-NEXT: ret
+;
+; RV64M-LABEL: test_cttz_i16:
+; RV64M: # %bb.0:
+; RV64M-NEXT: slli a1, a0, 48
+; RV64M-NEXT: beqz a1, .LBB1_2
+; RV64M-NEXT: # %bb.1: # %cond.false
+; RV64M-NEXT: lui a1, 61681
+; RV64M-NEXT: addiw a1, a1, -241
+; RV64M-NEXT: slli a2, a1, 32
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: addi a3, a0, -1
+; RV64M-NEXT: not a0, a0
+; RV64M-NEXT: and a0, a0, a3
+; RV64M-NEXT: srli a3, a0, 1
+; RV64M-NEXT: lui a4, 5
+; RV64M-NEXT: addiw a4, a4, 1365
+; RV64M-NEXT: and a3, a3, a4
+; RV64M-NEXT: slli a0, a0, 48
+; RV64M-NEXT: srli a0, a0, 48
+; RV64M-NEXT: sub a0, a0, a3
+; RV64M-NEXT: and a3, a0, a2
+; RV64M-NEXT: srli a0, a0, 2
+; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: add a0, a3, a0
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: and a0, a0, a1
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
+; RV64M-NEXT: mul a0, a0, a1
+; RV64M-NEXT: srli a0, a0, 56
+; RV64M-NEXT: ret
+; RV64M-NEXT: .LBB1_2:
+; RV64M-NEXT: li a0, 16
+; RV64M-NEXT: ret
;
; RV32ZBB-LABEL: test_cttz_i16:
; RV32ZBB: # %bb.0:
@@ -422,16 +519,33 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
; RV64I-NEXT: # %bb.1: # %cond.false
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: neg a1, a0
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: addi a3, a0, -1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: slli a5, a4, 32
+; RV64I-NEXT: add a4, a4, a5
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, %hi(.LCPI3_0)
-; RV64I-NEXT: ld a1, %lo(.LCPI3_0)(a1)
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srli a0, a0, 58
-; RV64I-NEXT: lui a1, %hi(.LCPI3_1)
-; RV64I-NEXT: addi a1, a1, %lo(.LCPI3_1)
-; RV64I-NEXT: add a0, a1, a0
-; RV64I-NEXT: lbu a0, 0(a0)
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -475,16 +589,33 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
; RV64M: # %bb.0:
; RV64M-NEXT: beqz a0, .LBB3_2
; RV64M-NEXT: # %bb.1: # %cond.false
-; RV64M-NEXT: lui a1, %hi(.LCPI3_0)
-; RV64M-NEXT: ld a1, %lo(.LCPI3_0)(a1)
-; RV64M-NEXT: neg a2, a0
+; RV64M-NEXT: lui a1, 61681
+; RV64M-NEXT: addiw a1, a1, -241
+; RV64M-NEXT: slli a2, a1, 32
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: addi a3, a0, -1
+; RV64M-NEXT: not a0, a0
+; RV64M-NEXT: and a0, a0, a3
+; RV64M-NEXT: srli a3, a0, 1
+; RV64M-NEXT: lui a4, 349525
+; RV64M-NEXT: addiw a4, a4, 1365
+; RV64M-NEXT: slli a5, a4, 32
+; RV64M-NEXT: add a4, a4, a5
+; RV64M-NEXT: and a3, a3, a4
+; RV64M-NEXT: sub a0, a0, a3
+; RV64M-NEXT: and a3, a0, a2
+; RV64M-NEXT: srli a0, a0, 2
; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: add a0, a3, a0
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: and a0, a0, a1
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
; RV64M-NEXT: mul a0, a0, a1
-; RV64M-NEXT: srli a0, a0, 58
-; RV64M-NEXT: lui a1, %hi(.LCPI3_1)
-; RV64M-NEXT: addi a1, a1, %lo(.LCPI3_1)
-; RV64M-NEXT: add a0, a1, a0
-; RV64M-NEXT: lbu a0, 0(a0)
+; RV64M-NEXT: srli a0, a0, 56
; RV64M-NEXT: ret
; RV64M-NEXT: .LBB3_2:
; RV64M-NEXT: li a0, 64
@@ -565,22 +696,65 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind {
; RV32_NOZBB-NEXT: andi a0, a0, 15
; RV32_NOZBB-NEXT: ret
;
-; RV64NOZBB-LABEL: test_cttz_i8_zero_undef:
-; RV64NOZBB: # %bb.0:
-; RV64NOZBB-NEXT: addi a1, a0, -1
-; RV64NOZBB-NEXT: not a0, a0
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: srli a1, a0, 1
-; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
-; RV64NOZBB-NEXT: andi a1, a0, 51
-; RV64NOZBB-NEXT: srli a0, a0, 2
-; RV64NOZBB-NEXT: andi a0, a0, 51
-; RV64NOZBB-NEXT: add a0, a1, a0
-; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: andi a0, a0, 15
-; RV64NOZBB-NEXT: ret
+; RV64I-LABEL: test_cttz_i8_zero_undef:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: addi a3, a0, -1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: andi a3, a0, 255
+; RV64I-NEXT: srli a0, a0, 1
+; RV64I-NEXT: andi a0, a0, 85
+; RV64I-NEXT: sub a3, a3, a0
+; RV64I-NEXT: and a0, a3, a2
+; RV64I-NEXT: srli a3, a3, 2
+; RV64I-NEXT: and a2, a3, a2
+; RV64I-NEXT: add a0, a0, a2
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: srli a0, a0, 56
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV64M-LABEL: test_cttz_i8_zero_undef:
+; RV64M: # %bb.0:
+; RV64M-NEXT: lui a1, 61681
+; RV64M-NEXT: addiw a1, a1, -241
+; RV64M-NEXT: slli a2, a1, 32
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: addi a3, a0, -1
+; RV64M-NEXT: not a0, a0
+; RV64M-NEXT: and a0, a0, a3
+; RV64M-NEXT: andi a3, a0, 255
+; RV64M-NEXT: srli a0, a0, 1
+; RV64M-NEXT: andi a0, a0, 85
+; RV64M-NEXT: sub a3, a3, a0
+; RV64M-NEXT: and a0, a3, a2
+; RV64M-NEXT: srli a3, a3, 2
+; RV64M-NEXT: and a2, a3, a2
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: and a0, a0, a1
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
+; RV64M-NEXT: mul a0, a0, a1
+; RV64M-NEXT: srli a0, a0, 56
+; RV64M-NEXT: ret
;
; RV32ZBB-LABEL: test_cttz_i8_zero_undef:
; RV32ZBB: # %bb.0:
@@ -640,29 +814,71 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a1, a0
; RV32_NOZBB-NEXT: ret
;
-; RV64NOZBB-LABEL: test_cttz_i16_zero_undef:
-; RV64NOZBB: # %bb.0:
-; RV64NOZBB-NEXT: addi a1, a0, -1
-; RV64NOZBB-NEXT: not a0, a0
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: srli a1, a0, 1
-; RV64NOZBB-NEXT: lui a2, 5
-; RV64NOZBB-NEXT: addiw a2, a2, 1365
-; RV64NOZBB-NEXT: and a1, a1, a2
-; RV64NOZBB-NEXT: sub a0, a0, a1
-; RV64NOZBB-NEXT: lui a1, 3
-; RV64NOZBB-NEXT: addiw a1, a1, 819
-; RV64NOZBB-NEXT: and a2, a0, a1
-; RV64NOZBB-NEXT: srli a0, a0, 2
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: add a0, a2, a0
-; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: andi a1, a0, 15
-; RV64NOZBB-NEXT: slli a0, a0, 52
-; RV64NOZBB-NEXT: srli a0, a0, 60
-; RV64NOZBB-NEXT: add a0, a1, a0
-; RV64NOZBB-NEXT: ret
+; RV64I-LABEL: test_cttz_i16_zero_undef:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: addi a3, a0, -1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 5
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: slli a0, a0, 48
+; RV64I-NEXT: srli a0, a0, 48
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: srli a0, a0, 56
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV64M-LABEL: test_cttz_i16_zero_undef:
+; RV64M: # %bb.0:
+; RV64M-NEXT: lui a1, 61681
+; RV64M-NEXT: addiw a1, a1, -241
+; RV64M-NEXT: slli a2, a1, 32
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: addi a3, a0, -1
+; RV64M-NEXT: not a0, a0
+; RV64M-NEXT: and a0, a0, a3
+; RV64M-NEXT: srli a3, a0, 1
+; RV64M-NEXT: lui a4, 5
+; RV64M-NEXT: addiw a4, a4, 1365
+; RV64M-NEXT: and a3, a3, a4
+; RV64M-NEXT: slli a0, a0, 48
+; RV64M-NEXT: srli a0, a0, 48
+; RV64M-NEXT: sub a0, a0, a3
+; RV64M-NEXT: and a3, a0, a2
+; RV64M-NEXT: srli a0, a0, 2
+; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: add a0, a3, a0
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: and a0, a0, a1
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
+; RV64M-NEXT: mul a0, a0, a1
+; RV64M-NEXT: srli a0, a0, 56
+; RV64M-NEXT: ret
;
; RV32ZBB-LABEL: test_cttz_i16_zero_undef:
; RV32ZBB: # %bb.0:
@@ -846,16 +1062,33 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: neg a1, a0
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: addi a3, a0, -1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: slli a5, a4, 32
+; RV64I-NEXT: add a4, a4, a5
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, %hi(.LCPI7_0)
-; RV64I-NEXT: ld a1, %lo(.LCPI7_0)(a1)
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srli a0, a0, 58
-; RV64I-NEXT: lui a1, %hi(.LCPI7_1)
-; RV64I-NEXT: addi a1, a1, %lo(.LCPI7_1)
-; RV64I-NEXT: add a0, a1, a0
-; RV64I-NEXT: lbu a0, 0(a0)
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -889,16 +1122,33 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
;
; RV64M-LABEL: test_cttz_i64_zero_undef:
; RV64M: # %bb.0:
-; RV64M-NEXT: lui a1, %hi(.LCPI7_0)
-; RV64M-NEXT: ld a1, %lo(.LCPI7_0)(a1)
-; RV64M-NEXT: neg a2, a0
+; RV64M-NEXT: lui a1, 61681
+; RV64M-NEXT: addiw a1, a1, -241
+; RV64M-NEXT: slli a2, a1, 32
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: addi a3, a0, -1
+; RV64M-NEXT: not a0, a0
+; RV64M-NEXT: and a0, a0, a3
+; RV64M-NEXT: srli a3, a0, 1
+; RV64M-NEXT: lui a4, 349525
+; RV64M-NEXT: addiw a4, a4, 1365
+; RV64M-NEXT: slli a5, a4, 32
+; RV64M-NEXT: add a4, a4, a5
+; RV64M-NEXT: and a3, a3, a4
+; RV64M-NEXT: sub a0, a0, a3
+; RV64M-NEXT: and a3, a0, a2
+; RV64M-NEXT: srli a0, a0, 2
; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: add a0, a3, a0
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: and a0, a0, a1
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
; RV64M-NEXT: mul a0, a0, a1
-; RV64M-NEXT: srli a0, a0, 58
-; RV64M-NEXT: lui a1, %hi(.LCPI7_1)
-; RV64M-NEXT: addi a1, a1, %lo(.LCPI7_1)
-; RV64M-NEXT: add a0, a1, a0
-; RV64M-NEXT: lbu a0, 0(a0)
+; RV64M-NEXT: srli a0, a0, 56
; RV64M-NEXT: ret
;
; RV32ZBB-LABEL: test_cttz_i64_zero_undef:
@@ -984,35 +1234,91 @@ define i8 @test_ctlz_i8(i8 %a) nounwind {
; RV32_NOZBB-NEXT: li a0, 8
; RV32_NOZBB-NEXT: ret
;
-; RV64NOZBB-LABEL: test_ctlz_i8:
-; RV64NOZBB: # %bb.0:
-; RV64NOZBB-NEXT: andi a1, a0, 255
-; RV64NOZBB-NEXT: beqz a1, .LBB8_2
-; RV64NOZBB-NEXT: # %bb.1: # %cond.false
-; RV64NOZBB-NEXT: slli a1, a0, 56
-; RV64NOZBB-NEXT: srli a1, a1, 57
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: slli a1, a0, 56
-; RV64NOZBB-NEXT: srli a1, a1, 58
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: slli a1, a0, 56
-; RV64NOZBB-NEXT: srli a1, a1, 60
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: not a0, a0
-; RV64NOZBB-NEXT: srli a1, a0, 1
-; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
-; RV64NOZBB-NEXT: andi a1, a0, 51
-; RV64NOZBB-NEXT: srli a0, a0, 2
-; RV64NOZBB-NEXT: andi a0, a0, 51
-; RV64NOZBB-NEXT: add a0, a1, a0
-; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: andi a0, a0, 15
-; RV64NOZBB-NEXT: ret
-; RV64NOZBB-NEXT: .LBB8_2:
-; RV64NOZBB-NEXT: li a0, 8
-; RV64NOZBB-NEXT: ret
+; RV64I-LABEL: test_ctlz_i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: andi a1, a0, 255
+; RV64I-NEXT: beqz a1, .LBB8_2
+; RV64I-NEXT: # %bb.1: # %cond.false
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: slli a1, a0, 56
+; RV64I-NEXT: srli a1, a1, 57
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a1, a0, 56
+; RV64I-NEXT: srli a1, a1, 58
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a1, a0, 56
+; RV64I-NEXT: srli a1, a1, 60
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: andi a1, a0, 255
+; RV64I-NEXT: srli a0, a0, 1
+; RV64I-NEXT: andi a0, a0, 85
+; RV64I-NEXT: sub a1, a1, a0
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: slli a2, a0, 32
+; RV64I-NEXT: add a2, a0, a2
+; RV64I-NEXT: slli a0, a2, 2
+; RV64I-NEXT: xor a0, a0, a2
+; RV64I-NEXT: and a3, a1, a0
+; RV64I-NEXT: srli a1, a1, 2
+; RV64I-NEXT: and a0, a1, a0
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: srli a1, a2, 3
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: srli a0, a0, 56
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB8_2:
+; RV64I-NEXT: li a0, 8
+; RV64I-NEXT: ret
+;
+; RV64M-LABEL: test_ctlz_i8:
+; RV64M: # %bb.0:
+; RV64M-NEXT: andi a1, a0, 255
+; RV64M-NEXT: beqz a1, .LBB8_2
+; RV64M-NEXT: # %bb.1: # %cond.false
+; RV64M-NEXT: slli a1, a0, 56
+; RV64M-NEXT: srli a1, a1, 57
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: slli a1, a0, 56
+; RV64M-NEXT: srli a1, a1, 58
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: slli a1, a0, 56
+; RV64M-NEXT: srli a1, a1, 60
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: not a0, a0
+; RV64M-NEXT: andi a1, a0, 255
+; RV64M-NEXT: srli a0, a0, 1
+; RV64M-NEXT: andi a0, a0, 85
+; RV64M-NEXT: sub a1, a1, a0
+; RV64M-NEXT: lui a0, 61681
+; RV64M-NEXT: addiw a0, a0, -241
+; RV64M-NEXT: slli a2, a0, 32
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: slli a2, a0, 2
+; RV64M-NEXT: xor a2, a2, a0
+; RV64M-NEXT: and a3, a1, a2
+; RV64M-NEXT: srli a1, a1, 2
+; RV64M-NEXT: and a1, a1, a2
+; RV64M-NEXT: add a1, a3, a1
+; RV64M-NEXT: srli a2, a1, 4
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: and a1, a1, a0
+; RV64M-NEXT: srli a2, a0, 3
+; RV64M-NEXT: and a0, a2, a0
+; RV64M-NEXT: mul a0, a1, a0
+; RV64M-NEXT: srli a0, a0, 56
+; RV64M-NEXT: ret
+; RV64M-NEXT: .LBB8_2:
+; RV64M-NEXT: li a0, 8
+; RV64M-NEXT: ret
;
; RV32ZBB-LABEL: test_ctlz_i8:
; RV32ZBB: # %bb.0:
@@ -1085,44 +1391,103 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
; RV32_NOZBB-NEXT: li a0, 16
; RV32_NOZBB-NEXT: ret
;
-; RV64NOZBB-LABEL: test_ctlz_i16:
-; RV64NOZBB: # %bb.0:
-; RV64NOZBB-NEXT: slli a1, a0, 48
-; RV64NOZBB-NEXT: beqz a1, .LBB9_2
-; RV64NOZBB-NEXT: # %bb.1: # %cond.false
-; RV64NOZBB-NEXT: srli a1, a1, 49
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: slli a1, a0, 48
-; RV64NOZBB-NEXT: srli a1, a1, 50
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: slli a1, a0, 48
-; RV64NOZBB-NEXT: srli a1, a1, 52
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: slli a1, a0, 48
-; RV64NOZBB-NEXT: srli a1, a1, 56
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: not a0, a0
-; RV64NOZBB-NEXT: srli a1, a0, 1
-; RV64NOZBB-NEXT: lui a2, 5
-; RV64NOZBB-NEXT: addiw a2, a2, 1365
-; RV64NOZBB-NEXT: and a1, a1, a2
-; RV64NOZBB-NEXT: sub a0, a0, a1
-; RV64NOZBB-NEXT: lui a1, 3
-; RV64NOZBB-NEXT: addiw a1, a1, 819
-; RV64NOZBB-NEXT: and a2, a0, a1
-; RV64NOZBB-NEXT: srli a0, a0, 2
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: add a0, a2, a0
-; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: andi a1, a0, 15
-; RV64NOZBB-NEXT: slli a0, a0, 52
-; RV64NOZBB-NEXT: srli a0, a0, 60
-; RV64NOZBB-NEXT: add a0, a1, a0
-; RV64NOZBB-NEXT: ret
-; RV64NOZBB-NEXT: .LBB9_2:
-; RV64NOZBB-NEXT: li a0, 16
-; RV64NOZBB-NEXT: ret
+; RV64I-LABEL: test_ctlz_i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a1, a0, 48
+; RV64I-NEXT: beqz a1, .LBB9_2
+; RV64I-NEXT: # %bb.1: # %cond.false
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: srli a1, a1, 49
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a1, a0, 48
+; RV64I-NEXT: srli a1, a1, 50
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a1, a0, 48
+; RV64I-NEXT: srli a1, a1, 52
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a1, a0, 48
+; RV64I-NEXT: srli a1, a1, 56
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: lui a1, 16
+; RV64I-NEXT: addiw a1, a1, -1
+; RV64I-NEXT: and a1, a0, a1
+; RV64I-NEXT: srli a0, a0, 1
+; RV64I-NEXT: lui a2, 5
+; RV64I-NEXT: addiw a2, a2, 1365
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: sub a1, a1, a0
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: slli a2, a0, 32
+; RV64I-NEXT: add a2, a0, a2
+; RV64I-NEXT: slli a0, a2, 2
+; RV64I-NEXT: xor a0, a0, a2
+; RV64I-NEXT: and a3, a1, a0
+; RV64I-NEXT: srli a1, a1, 2
+; RV64I-NEXT: and a0, a1, a0
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: srli a1, a2, 3
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: srli a0, a0, 56
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+; RV64I-NEXT: .LBB9_2:
+; RV64I-NEXT: li a0, 16
+; RV64I-NEXT: ret
+;
+; RV64M-LABEL: test_ctlz_i16:
+; RV64M: # %bb.0:
+; RV64M-NEXT: slli a1, a0, 48
+; RV64M-NEXT: beqz a1, .LBB9_2
+; RV64M-NEXT: # %bb.1: # %cond.false
+; RV64M-NEXT: srli a1, a1, 49
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: slli a1, a0, 48
+; RV64M-NEXT: srli a1, a1, 50
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: slli a1, a0, 48
+; RV64M-NEXT: srli a1, a1, 52
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: slli a1, a0, 48
+; RV64M-NEXT: srli a1, a1, 56
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: not a0, a0
+; RV64M-NEXT: lui a1, 16
+; RV64M-NEXT: addiw a1, a1, -1
+; RV64M-NEXT: and a1, a0, a1
+; RV64M-NEXT: srli a0, a0, 1
+; RV64M-NEXT: lui a2, 5
+; RV64M-NEXT: addiw a2, a2, 1365
+; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: sub a1, a1, a0
+; RV64M-NEXT: lui a0, 61681
+; RV64M-NEXT: addiw a0, a0, -241
+; RV64M-NEXT: slli a2, a0, 32
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: slli a2, a0, 2
+; RV64M-NEXT: xor a2, a2, a0
+; RV64M-NEXT: and a3, a1, a2
+; RV64M-NEXT: srli a1, a1, 2
+; RV64M-NEXT: and a1, a1, a2
+; RV64M-NEXT: add a1, a3, a1
+; RV64M-NEXT: srli a2, a1, 4
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: and a1, a1, a0
+; RV64M-NEXT: srli a2, a0, 3
+; RV64M-NEXT: and a0, a2, a0
+; RV64M-NEXT: mul a0, a1, a0
+; RV64M-NEXT: srli a0, a0, 56
+; RV64M-NEXT: ret
+; RV64M-NEXT: .LBB9_2:
+; RV64M-NEXT: li a0, 16
+; RV64M-NEXT: ret
;
; RV32ZBB-LABEL: test_ctlz_i16:
; RV32ZBB: # %bb.0:
@@ -1222,22 +1587,26 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -1305,22 +1674,26 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
; RV64M-NEXT: lui a2, 349525
; RV64M-NEXT: addiw a2, a2, 1365
; RV64M-NEXT: and a1, a1, a2
+; RV64M-NEXT: slli a0, a0, 32
+; RV64M-NEXT: srli a0, a0, 32
; RV64M-NEXT: sub a0, a0, a1
-; RV64M-NEXT: lui a1, 209715
-; RV64M-NEXT: addiw a1, a1, 819
-; RV64M-NEXT: and a2, a0, a1
-; RV64M-NEXT: srli a0, a0, 2
-; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: add a0, a2, a0
-; RV64M-NEXT: srli a1, a0, 4
-; RV64M-NEXT: add a0, a0, a1
; RV64M-NEXT: lui a1, 61681
-; RV64M-NEXT: addi a1, a1, -241
+; RV64M-NEXT: addiw a1, a1, -241
+; RV64M-NEXT: slli a2, a1, 32
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: and a3, a0, a2
+; RV64M-NEXT: srli a0, a0, 2
+; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: add a0, a3, a0
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: lui a1, 4112
-; RV64M-NEXT: addi a1, a1, 257
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
; RV64M-NEXT: mul a0, a0, a1
-; RV64M-NEXT: srliw a0, a0, 24
+; RV64M-NEXT: srli a0, a0, 56
; RV64M-NEXT: ret
; RV64M-NEXT: .LBB10_2:
; RV64M-NEXT: li a0, 32
@@ -1466,25 +1839,21 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
@@ -1583,25 +1952,21 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV64M-NEXT: add a2, a2, a3
; RV64M-NEXT: and a1, a1, a2
; RV64M-NEXT: sub a0, a0, a1
-; RV64M-NEXT: lui a1, 209715
-; RV64M-NEXT: addiw a1, a1, 819
-; RV64M-NEXT: slli a2, a1, 32
-; RV64M-NEXT: add a1, a1, a2
-; RV64M-NEXT: and a2, a0, a1
-; RV64M-NEXT: srli a0, a0, 2
-; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: add a0, a2, a0
-; RV64M-NEXT: srli a1, a0, 4
-; RV64M-NEXT: add a0, a0, a1
; RV64M-NEXT: lui a1, 61681
; RV64M-NEXT: addiw a1, a1, -241
; RV64M-NEXT: slli a2, a1, 32
; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: and a3, a0, a2
+; RV64M-NEXT: srli a0, a0, 2
+; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: add a0, a3, a0
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: lui a1, 4112
-; RV64M-NEXT: addiw a1, a1, 257
-; RV64M-NEXT: slli a2, a1, 32
-; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
; RV64M-NEXT: mul a0, a0, a1
; RV64M-NEXT: srli a0, a0, 56
; RV64M-NEXT: ret
@@ -1673,29 +2038,79 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind {
; RV32_NOZBB-NEXT: andi a0, a0, 15
; RV32_NOZBB-NEXT: ret
;
-; RV64NOZBB-LABEL: test_ctlz_i8_zero_undef:
-; RV64NOZBB: # %bb.0:
-; RV64NOZBB-NEXT: slli a1, a0, 56
-; RV64NOZBB-NEXT: srli a1, a1, 57
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: slli a1, a0, 56
-; RV64NOZBB-NEXT: srli a1, a1, 58
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: slli a1, a0, 56
-; RV64NOZBB-NEXT: srli a1, a1, 60
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: not a0, a0
-; RV64NOZBB-NEXT: srli a1, a0, 1
-; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
-; RV64NOZBB-NEXT: andi a1, a0, 51
-; RV64NOZBB-NEXT: srli a0, a0, 2
-; RV64NOZBB-NEXT: andi a0, a0, 51
-; RV64NOZBB-NEXT: add a0, a1, a0
-; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: andi a0, a0, 15
-; RV64NOZBB-NEXT: ret
+; RV64I-LABEL: test_ctlz_i8_zero_undef:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: slli a1, a0, 56
+; RV64I-NEXT: srli a1, a1, 57
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a1, a0, 56
+; RV64I-NEXT: srli a1, a1, 58
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a1, a0, 56
+; RV64I-NEXT: srli a1, a1, 60
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: andi a1, a0, 255
+; RV64I-NEXT: srli a0, a0, 1
+; RV64I-NEXT: andi a0, a0, 85
+; RV64I-NEXT: sub a1, a1, a0
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: slli a2, a0, 32
+; RV64I-NEXT: add a2, a0, a2
+; RV64I-NEXT: slli a0, a2, 2
+; RV64I-NEXT: xor a0, a0, a2
+; RV64I-NEXT: and a3, a1, a0
+; RV64I-NEXT: srli a1, a1, 2
+; RV64I-NEXT: and a0, a1, a0
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: srli a1, a2, 3
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: srli a0, a0, 56
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV64M-LABEL: test_ctlz_i8_zero_undef:
+; RV64M: # %bb.0:
+; RV64M-NEXT: slli a1, a0, 56
+; RV64M-NEXT: srli a1, a1, 57
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: slli a1, a0, 56
+; RV64M-NEXT: srli a1, a1, 58
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: slli a1, a0, 56
+; RV64M-NEXT: srli a1, a1, 60
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: not a0, a0
+; RV64M-NEXT: andi a1, a0, 255
+; RV64M-NEXT: srli a0, a0, 1
+; RV64M-NEXT: andi a0, a0, 85
+; RV64M-NEXT: sub a1, a1, a0
+; RV64M-NEXT: lui a0, 61681
+; RV64M-NEXT: addiw a0, a0, -241
+; RV64M-NEXT: slli a2, a0, 32
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: slli a2, a0, 2
+; RV64M-NEXT: xor a2, a2, a0
+; RV64M-NEXT: and a3, a1, a2
+; RV64M-NEXT: srli a1, a1, 2
+; RV64M-NEXT: and a1, a1, a2
+; RV64M-NEXT: add a1, a3, a1
+; RV64M-NEXT: srli a2, a1, 4
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: and a1, a1, a0
+; RV64M-NEXT: srli a2, a0, 3
+; RV64M-NEXT: and a0, a2, a0
+; RV64M-NEXT: mul a0, a1, a0
+; RV64M-NEXT: srli a0, a0, 56
+; RV64M-NEXT: ret
;
; RV32ZBB-LABEL: test_ctlz_i8_zero_undef:
; RV32ZBB: # %bb.0:
@@ -1763,39 +2178,93 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a1, a0
; RV32_NOZBB-NEXT: ret
;
-; RV64NOZBB-LABEL: test_ctlz_i16_zero_undef:
-; RV64NOZBB: # %bb.0:
-; RV64NOZBB-NEXT: slli a1, a0, 48
-; RV64NOZBB-NEXT: srli a1, a1, 49
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: slli a1, a0, 48
-; RV64NOZBB-NEXT: srli a1, a1, 50
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: slli a1, a0, 48
-; RV64NOZBB-NEXT: srli a1, a1, 52
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: slli a1, a0, 48
-; RV64NOZBB-NEXT: srli a1, a1, 56
-; RV64NOZBB-NEXT: or a0, a0, a1
-; RV64NOZBB-NEXT: not a0, a0
-; RV64NOZBB-NEXT: srli a1, a0, 1
-; RV64NOZBB-NEXT: lui a2, 5
-; RV64NOZBB-NEXT: addiw a2, a2, 1365
-; RV64NOZBB-NEXT: and a1, a1, a2
-; RV64NOZBB-NEXT: sub a0, a0, a1
-; RV64NOZBB-NEXT: lui a1, 3
-; RV64NOZBB-NEXT: addiw a1, a1, 819
-; RV64NOZBB-NEXT: and a2, a0, a1
-; RV64NOZBB-NEXT: srli a0, a0, 2
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: add a0, a2, a0
-; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: andi a1, a0, 15
-; RV64NOZBB-NEXT: slli a0, a0, 52
-; RV64NOZBB-NEXT: srli a0, a0, 60
-; RV64NOZBB-NEXT: add a0, a1, a0
-; RV64NOZBB-NEXT: ret
+; RV64I-LABEL: test_ctlz_i16_zero_undef:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: slli a1, a0, 48
+; RV64I-NEXT: srli a1, a1, 49
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a1, a0, 48
+; RV64I-NEXT: srli a1, a1, 50
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a1, a0, 48
+; RV64I-NEXT: srli a1, a1, 52
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: slli a1, a0, 48
+; RV64I-NEXT: srli a1, a1, 56
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: lui a1, 16
+; RV64I-NEXT: addiw a1, a1, -1
+; RV64I-NEXT: and a1, a0, a1
+; RV64I-NEXT: srli a0, a0, 1
+; RV64I-NEXT: lui a2, 5
+; RV64I-NEXT: addiw a2, a2, 1365
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: sub a1, a1, a0
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: slli a2, a0, 32
+; RV64I-NEXT: add a2, a0, a2
+; RV64I-NEXT: slli a0, a2, 2
+; RV64I-NEXT: xor a0, a0, a2
+; RV64I-NEXT: and a3, a1, a0
+; RV64I-NEXT: srli a1, a1, 2
+; RV64I-NEXT: and a0, a1, a0
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: srli a1, a2, 3
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: srli a0, a0, 56
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV64M-LABEL: test_ctlz_i16_zero_undef:
+; RV64M: # %bb.0:
+; RV64M-NEXT: slli a1, a0, 48
+; RV64M-NEXT: srli a1, a1, 49
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: slli a1, a0, 48
+; RV64M-NEXT: srli a1, a1, 50
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: slli a1, a0, 48
+; RV64M-NEXT: srli a1, a1, 52
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: slli a1, a0, 48
+; RV64M-NEXT: srli a1, a1, 56
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: not a0, a0
+; RV64M-NEXT: lui a1, 16
+; RV64M-NEXT: addiw a1, a1, -1
+; RV64M-NEXT: and a1, a0, a1
+; RV64M-NEXT: srli a0, a0, 1
+; RV64M-NEXT: lui a2, 5
+; RV64M-NEXT: addiw a2, a2, 1365
+; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: sub a1, a1, a0
+; RV64M-NEXT: lui a0, 61681
+; RV64M-NEXT: addiw a0, a0, -241
+; RV64M-NEXT: slli a2, a0, 32
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: slli a2, a0, 2
+; RV64M-NEXT: xor a2, a2, a0
+; RV64M-NEXT: and a3, a1, a2
+; RV64M-NEXT: srli a1, a1, 2
+; RV64M-NEXT: and a1, a1, a2
+; RV64M-NEXT: add a1, a3, a1
+; RV64M-NEXT: srli a2, a1, 4
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: and a1, a1, a0
+; RV64M-NEXT: srli a2, a0, 3
+; RV64M-NEXT: and a0, a2, a0
+; RV64M-NEXT: mul a0, a1, a0
+; RV64M-NEXT: srli a0, a0, 56
+; RV64M-NEXT: ret
;
; RV32ZBB-LABEL: test_ctlz_i16_zero_undef:
; RV32ZBB: # %bb.0:
@@ -1887,22 +2356,26 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -1959,22 +2432,26 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
; RV64M-NEXT: lui a2, 349525
; RV64M-NEXT: addiw a2, a2, 1365
; RV64M-NEXT: and a1, a1, a2
+; RV64M-NEXT: slli a0, a0, 32
+; RV64M-NEXT: srli a0, a0, 32
; RV64M-NEXT: sub a0, a0, a1
-; RV64M-NEXT: lui a1, 209715
-; RV64M-NEXT: addiw a1, a1, 819
-; RV64M-NEXT: and a2, a0, a1
-; RV64M-NEXT: srli a0, a0, 2
-; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: add a0, a2, a0
-; RV64M-NEXT: srli a1, a0, 4
-; RV64M-NEXT: add a0, a0, a1
; RV64M-NEXT: lui a1, 61681
-; RV64M-NEXT: addi a1, a1, -241
+; RV64M-NEXT: addiw a1, a1, -241
+; RV64M-NEXT: slli a2, a1, 32
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: and a3, a0, a2
+; RV64M-NEXT: srli a0, a0, 2
+; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: add a0, a3, a0
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: lui a1, 4112
-; RV64M-NEXT: addi a1, a1, 257
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
; RV64M-NEXT: mul a0, a0, a1
-; RV64M-NEXT: srliw a0, a0, 24
+; RV64M-NEXT: srli a0, a0, 56
; RV64M-NEXT: ret
;
; RV32ZBB-LABEL: test_ctlz_i32_zero_undef:
@@ -2115,25 +2592,21 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
@@ -2227,25 +2700,21 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
; RV64M-NEXT: add a2, a2, a3
; RV64M-NEXT: and a1, a1, a2
; RV64M-NEXT: sub a0, a0, a1
-; RV64M-NEXT: lui a1, 209715
-; RV64M-NEXT: addiw a1, a1, 819
-; RV64M-NEXT: slli a2, a1, 32
-; RV64M-NEXT: add a1, a1, a2
-; RV64M-NEXT: and a2, a0, a1
-; RV64M-NEXT: srli a0, a0, 2
-; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: add a0, a2, a0
-; RV64M-NEXT: srli a1, a0, 4
-; RV64M-NEXT: add a0, a0, a1
; RV64M-NEXT: lui a1, 61681
; RV64M-NEXT: addiw a1, a1, -241
; RV64M-NEXT: slli a2, a1, 32
; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: and a3, a0, a2
+; RV64M-NEXT: srli a0, a0, 2
+; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: add a0, a3, a0
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: lui a1, 4112
-; RV64M-NEXT: addiw a1, a1, 257
-; RV64M-NEXT: slli a2, a1, 32
-; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
; RV64M-NEXT: mul a0, a0, a1
; RV64M-NEXT: srli a0, a0, 56
; RV64M-NEXT: ret
@@ -2304,19 +2773,59 @@ define i8 @test_ctpop_i8(i8 %a) nounwind {
; RV32_NOZBB-NEXT: andi a0, a0, 15
; RV32_NOZBB-NEXT: ret
;
-; RV64NOZBB-LABEL: test_ctpop_i8:
-; RV64NOZBB: # %bb.0:
-; RV64NOZBB-NEXT: srli a1, a0, 1
-; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
-; RV64NOZBB-NEXT: andi a1, a0, 51
-; RV64NOZBB-NEXT: srli a0, a0, 2
-; RV64NOZBB-NEXT: andi a0, a0, 51
-; RV64NOZBB-NEXT: add a0, a1, a0
-; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: andi a0, a0, 15
-; RV64NOZBB-NEXT: ret
+; RV64I-LABEL: test_ctpop_i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: andi a3, a0, 255
+; RV64I-NEXT: srli a0, a0, 1
+; RV64I-NEXT: andi a0, a0, 85
+; RV64I-NEXT: sub a3, a3, a0
+; RV64I-NEXT: srli a0, a3, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: and a2, a3, a2
+; RV64I-NEXT: add a0, a2, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: srli a0, a0, 56
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV64M-LABEL: test_ctpop_i8:
+; RV64M: # %bb.0:
+; RV64M-NEXT: lui a1, 61681
+; RV64M-NEXT: addiw a1, a1, -241
+; RV64M-NEXT: slli a2, a1, 32
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: andi a3, a0, 255
+; RV64M-NEXT: srli a0, a0, 1
+; RV64M-NEXT: andi a0, a0, 85
+; RV64M-NEXT: sub a3, a3, a0
+; RV64M-NEXT: srli a0, a3, 2
+; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: and a2, a3, a2
+; RV64M-NEXT: add a0, a2, a0
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: and a0, a0, a1
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
+; RV64M-NEXT: mul a0, a0, a1
+; RV64M-NEXT: srli a0, a0, 56
+; RV64M-NEXT: ret
;
; RV32ZBB-LABEL: test_ctpop_i8:
; RV32ZBB: # %bb.0:
@@ -2346,16 +2855,31 @@ define i8 @test_ctpop_i8(i8 %a) nounwind {
;
; RV64XTHEADBB-LABEL: test_ctpop_i8:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: srli a1, a0, 1
-; RV64XTHEADBB-NEXT: andi a1, a1, 85
-; RV64XTHEADBB-NEXT: subw a0, a0, a1
-; RV64XTHEADBB-NEXT: andi a1, a0, 51
-; RV64XTHEADBB-NEXT: srli a0, a0, 2
-; RV64XTHEADBB-NEXT: andi a0, a0, 51
-; RV64XTHEADBB-NEXT: add a0, a1, a0
-; RV64XTHEADBB-NEXT: srli a1, a0, 4
-; RV64XTHEADBB-NEXT: add a0, a0, a1
-; RV64XTHEADBB-NEXT: andi a0, a0, 15
+; RV64XTHEADBB-NEXT: addi sp, sp, -16
+; RV64XTHEADBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64XTHEADBB-NEXT: lui a1, 61681
+; RV64XTHEADBB-NEXT: addiw a1, a1, -241
+; RV64XTHEADBB-NEXT: slli a2, a1, 32
+; RV64XTHEADBB-NEXT: add a1, a1, a2
+; RV64XTHEADBB-NEXT: slli a2, a1, 2
+; RV64XTHEADBB-NEXT: xor a2, a2, a1
+; RV64XTHEADBB-NEXT: andi a3, a0, 255
+; RV64XTHEADBB-NEXT: srli a0, a0, 1
+; RV64XTHEADBB-NEXT: andi a0, a0, 85
+; RV64XTHEADBB-NEXT: sub a3, a3, a0
+; RV64XTHEADBB-NEXT: srli a0, a3, 2
+; RV64XTHEADBB-NEXT: and a0, a0, a2
+; RV64XTHEADBB-NEXT: and a2, a3, a2
+; RV64XTHEADBB-NEXT: add a0, a2, a0
+; RV64XTHEADBB-NEXT: srli a2, a0, 4
+; RV64XTHEADBB-NEXT: add a0, a0, a2
+; RV64XTHEADBB-NEXT: and a0, a0, a1
+; RV64XTHEADBB-NEXT: srli a2, a1, 3
+; RV64XTHEADBB-NEXT: and a1, a2, a1
+; RV64XTHEADBB-NEXT: call __muldi3
+; RV64XTHEADBB-NEXT: srli a0, a0, 56
+; RV64XTHEADBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64XTHEADBB-NEXT: addi sp, sp, 16
; RV64XTHEADBB-NEXT: ret
%1 = call i8 @llvm.ctpop.i8(i8 %a)
ret i8 %1
@@ -2383,26 +2907,65 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a1, a0
; RV32_NOZBB-NEXT: ret
;
-; RV64NOZBB-LABEL: test_ctpop_i16:
-; RV64NOZBB: # %bb.0:
-; RV64NOZBB-NEXT: srli a1, a0, 1
-; RV64NOZBB-NEXT: lui a2, 5
-; RV64NOZBB-NEXT: addiw a2, a2, 1365
-; RV64NOZBB-NEXT: and a1, a1, a2
-; RV64NOZBB-NEXT: sub a0, a0, a1
-; RV64NOZBB-NEXT: lui a1, 3
-; RV64NOZBB-NEXT: addiw a1, a1, 819
-; RV64NOZBB-NEXT: and a2, a0, a1
-; RV64NOZBB-NEXT: srli a0, a0, 2
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: add a0, a2, a0
-; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: andi a1, a0, 15
-; RV64NOZBB-NEXT: slli a0, a0, 52
-; RV64NOZBB-NEXT: srli a0, a0, 60
-; RV64NOZBB-NEXT: add a0, a1, a0
-; RV64NOZBB-NEXT: ret
+; RV64I-LABEL: test_ctpop_i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 5
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: slli a0, a0, 48
+; RV64I-NEXT: srli a0, a0, 48
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: srli a3, a0, 2
+; RV64I-NEXT: and a3, a3, a2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a0, a3
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: srli a0, a0, 56
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV64M-LABEL: test_ctpop_i16:
+; RV64M: # %bb.0:
+; RV64M-NEXT: lui a1, 61681
+; RV64M-NEXT: addiw a1, a1, -241
+; RV64M-NEXT: slli a2, a1, 32
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: srli a3, a0, 1
+; RV64M-NEXT: lui a4, 5
+; RV64M-NEXT: addiw a4, a4, 1365
+; RV64M-NEXT: and a3, a3, a4
+; RV64M-NEXT: slli a0, a0, 48
+; RV64M-NEXT: srli a0, a0, 48
+; RV64M-NEXT: sub a0, a0, a3
+; RV64M-NEXT: srli a3, a0, 2
+; RV64M-NEXT: and a3, a3, a2
+; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: add a0, a0, a3
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
+; RV64M-NEXT: and a0, a0, a1
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
+; RV64M-NEXT: mul a0, a0, a1
+; RV64M-NEXT: srli a0, a0, 56
+; RV64M-NEXT: ret
;
; RV32ZBB-LABEL: test_ctpop_i16:
; RV32ZBB: # %bb.0:
@@ -2439,23 +3002,33 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
;
; RV64XTHEADBB-LABEL: test_ctpop_i16:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: srli a1, a0, 1
-; RV64XTHEADBB-NEXT: lui a2, 5
-; RV64XTHEADBB-NEXT: addiw a2, a2, 1365
-; RV64XTHEADBB-NEXT: and a1, a1, a2
-; RV64XTHEADBB-NEXT: sub a0, a0, a1
-; RV64XTHEADBB-NEXT: lui a1, 3
-; RV64XTHEADBB-NEXT: addiw a1, a1, 819
-; RV64XTHEADBB-NEXT: and a2, a0, a1
-; RV64XTHEADBB-NEXT: srli a0, a0, 2
+; RV64XTHEADBB-NEXT: addi sp, sp, -16
+; RV64XTHEADBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64XTHEADBB-NEXT: lui a1, 61681
+; RV64XTHEADBB-NEXT: addiw a1, a1, -241
+; RV64XTHEADBB-NEXT: slli a2, a1, 32
+; RV64XTHEADBB-NEXT: add a1, a1, a2
+; RV64XTHEADBB-NEXT: slli a2, a1, 2
+; RV64XTHEADBB-NEXT: xor a2, a2, a1
+; RV64XTHEADBB-NEXT: srli a3, a0, 1
+; RV64XTHEADBB-NEXT: lui a4, 5
+; RV64XTHEADBB-NEXT: addiw a4, a4, 1365
+; RV64XTHEADBB-NEXT: and a3, a3, a4
+; RV64XTHEADBB-NEXT: th.extu a0, a0, 15, 0
+; RV64XTHEADBB-NEXT: sub a0, a0, a3
+; RV64XTHEADBB-NEXT: srli a3, a0, 2
+; RV64XTHEADBB-NEXT: and a3, a3, a2
+; RV64XTHEADBB-NEXT: and a0, a0, a2
+; RV64XTHEADBB-NEXT: add a0, a0, a3
+; RV64XTHEADBB-NEXT: srli a2, a0, 4
+; RV64XTHEADBB-NEXT: add a0, a0, a2
; RV64XTHEADBB-NEXT: and a0, a0, a1
-; RV64XTHEADBB-NEXT: add a0, a2, a0
-; RV64XTHEADBB-NEXT: srli a1, a0, 4
-; RV64XTHEADBB-NEXT: add a0, a0, a1
-; RV64XTHEADBB-NEXT: andi a1, a0, 15
-; RV64XTHEADBB-NEXT: slli a0, a0, 52
-; RV64XTHEADBB-NEXT: srli a0, a0, 60
-; RV64XTHEADBB-NEXT: add a0, a1, a0
+; RV64XTHEADBB-NEXT: srli a2, a1, 3
+; RV64XTHEADBB-NEXT: and a1, a2, a1
+; RV64XTHEADBB-NEXT: call __muldi3
+; RV64XTHEADBB-NEXT: srli a0, a0, 56
+; RV64XTHEADBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64XTHEADBB-NEXT: addi sp, sp, 16
; RV64XTHEADBB-NEXT: ret
%1 = call i16 @llvm.ctpop.i16(i16 %a)
ret i16 %1
@@ -2494,26 +3067,30 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: srli a1, a0, 1
-; RV64I-NEXT: lui a2, 349525
-; RV64I-NEXT: addiw a2, a2, 1365
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: srli a3, a0, 2
+; RV64I-NEXT: and a3, a3, a2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a0, a3
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -2544,26 +3121,30 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
;
; RV64M-LABEL: test_ctpop_i32:
; RV64M: # %bb.0:
-; RV64M-NEXT: srli a1, a0, 1
-; RV64M-NEXT: lui a2, 349525
-; RV64M-NEXT: addiw a2, a2, 1365
-; RV64M-NEXT: and a1, a1, a2
-; RV64M-NEXT: sub a0, a0, a1
-; RV64M-NEXT: lui a1, 209715
-; RV64M-NEXT: addiw a1, a1, 819
-; RV64M-NEXT: and a2, a0, a1
-; RV64M-NEXT: srli a0, a0, 2
-; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: add a0, a2, a0
-; RV64M-NEXT: srli a1, a0, 4
-; RV64M-NEXT: add a0, a0, a1
; RV64M-NEXT: lui a1, 61681
-; RV64M-NEXT: addi a1, a1, -241
+; RV64M-NEXT: addiw a1, a1, -241
+; RV64M-NEXT: slli a2, a1, 32
+; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: srli a3, a0, 1
+; RV64M-NEXT: lui a4, 349525
+; RV64M-NEXT: addiw a4, a4, 1365
+; RV64M-NEXT: and a3, a3, a4
+; RV64M-NEXT: slli a0, a0, 32
+; RV64M-NEXT: srli a0, a0, 32
+; RV64M-NEXT: sub a0, a0, a3
+; RV64M-NEXT: srli a3, a0, 2
+; RV64M-NEXT: and a3, a3, a2
+; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: add a0, a0, a3
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: lui a1, 4112
-; RV64M-NEXT: addi a1, a1, 257
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
; RV64M-NEXT: mul a0, a0, a1
-; RV64M-NEXT: srliw a0, a0, 24
+; RV64M-NEXT: srli a0, a0, 56
; RV64M-NEXT: ret
;
; RV32ZBB-LABEL: test_ctpop_i32:
@@ -2608,26 +3189,29 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: addi sp, sp, -16
; RV64XTHEADBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64XTHEADBB-NEXT: srli a1, a0, 1
-; RV64XTHEADBB-NEXT: lui a2, 349525
-; RV64XTHEADBB-NEXT: addiw a2, a2, 1365
-; RV64XTHEADBB-NEXT: and a1, a1, a2
-; RV64XTHEADBB-NEXT: sub a0, a0, a1
-; RV64XTHEADBB-NEXT: lui a1, 209715
-; RV64XTHEADBB-NEXT: addiw a1, a1, 819
-; RV64XTHEADBB-NEXT: and a2, a0, a1
-; RV64XTHEADBB-NEXT: srli a0, a0, 2
-; RV64XTHEADBB-NEXT: and a0, a0, a1
-; RV64XTHEADBB-NEXT: add a0, a2, a0
-; RV64XTHEADBB-NEXT: srli a1, a0, 4
-; RV64XTHEADBB-NEXT: add a0, a0, a1
; RV64XTHEADBB-NEXT: lui a1, 61681
; RV64XTHEADBB-NEXT: addiw a1, a1, -241
+; RV64XTHEADBB-NEXT: slli a2, a1, 32
+; RV64XTHEADBB-NEXT: add a1, a1, a2
+; RV64XTHEADBB-NEXT: slli a2, a1, 2
+; RV64XTHEADBB-NEXT: xor a2, a2, a1
+; RV64XTHEADBB-NEXT: srli a3, a0, 1
+; RV64XTHEADBB-NEXT: lui a4, 349525
+; RV64XTHEADBB-NEXT: addiw a4, a4, 1365
+; RV64XTHEADBB-NEXT: and a3, a3, a4
+; RV64XTHEADBB-NEXT: th.extu a0, a0, 31, 0
+; RV64XTHEADBB-NEXT: sub a0, a0, a3
+; RV64XTHEADBB-NEXT: srli a3, a0, 2
+; RV64XTHEADBB-NEXT: and a3, a3, a2
+; RV64XTHEADBB-NEXT: and a0, a0, a2
+; RV64XTHEADBB-NEXT: add a0, a0, a3
+; RV64XTHEADBB-NEXT: srli a2, a0, 4
+; RV64XTHEADBB-NEXT: add a0, a0, a2
; RV64XTHEADBB-NEXT: and a0, a0, a1
-; RV64XTHEADBB-NEXT: lui a1, 4112
-; RV64XTHEADBB-NEXT: addiw a1, a1, 257
+; RV64XTHEADBB-NEXT: srli a2, a1, 3
+; RV64XTHEADBB-NEXT: and a1, a2, a1
; RV64XTHEADBB-NEXT: call __muldi3
-; RV64XTHEADBB-NEXT: srliw a0, a0, 24
+; RV64XTHEADBB-NEXT: srli a0, a0, 56
; RV64XTHEADBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64XTHEADBB-NEXT: addi sp, sp, 16
; RV64XTHEADBB-NEXT: ret
@@ -2697,32 +3281,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: srli a1, a0, 1
-; RV64I-NEXT: lui a2, 349525
-; RV64I-NEXT: addiw a2, a2, 1365
-; RV64I-NEXT: slli a3, a2, 32
-; RV64I-NEXT: add a2, a2, a3
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: slli a5, a4, 32
+; RV64I-NEXT: add a4, a4, a5
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
@@ -2769,32 +3349,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
;
; RV64M-LABEL: test_ctpop_i64:
; RV64M: # %bb.0:
-; RV64M-NEXT: srli a1, a0, 1
-; RV64M-NEXT: lui a2, 349525
-; RV64M-NEXT: addiw a2, a2, 1365
-; RV64M-NEXT: slli a3, a2, 32
-; RV64M-NEXT: add a2, a2, a3
-; RV64M-NEXT: and a1, a1, a2
-; RV64M-NEXT: sub a0, a0, a1
-; RV64M-NEXT: lui a1, 209715
-; RV64M-NEXT: addiw a1, a1, 819
-; RV64M-NEXT: slli a2, a1, 32
-; RV64M-NEXT: add a1, a1, a2
-; RV64M-NEXT: and a2, a0, a1
-; RV64M-NEXT: srli a0, a0, 2
-; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: add a0, a2, a0
-; RV64M-NEXT: srli a1, a0, 4
-; RV64M-NEXT: add a0, a0, a1
; RV64M-NEXT: lui a1, 61681
; RV64M-NEXT: addiw a1, a1, -241
; RV64M-NEXT: slli a2, a1, 32
; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: slli a2, a1, 2
+; RV64M-NEXT: xor a2, a2, a1
+; RV64M-NEXT: srli a3, a0, 1
+; RV64M-NEXT: lui a4, 349525
+; RV64M-NEXT: addiw a4, a4, 1365
+; RV64M-NEXT: slli a5, a4, 32
+; RV64M-NEXT: add a4, a4, a5
+; RV64M-NEXT: and a3, a3, a4
+; RV64M-NEXT: sub a0, a0, a3
+; RV64M-NEXT: and a3, a0, a2
+; RV64M-NEXT: srli a0, a0, 2
+; RV64M-NEXT: and a0, a0, a2
+; RV64M-NEXT: add a0, a3, a0
+; RV64M-NEXT: srli a2, a0, 4
+; RV64M-NEXT: add a0, a0, a2
; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: lui a1, 4112
-; RV64M-NEXT: addiw a1, a1, 257
-; RV64M-NEXT: slli a2, a1, 32
-; RV64M-NEXT: add a1, a1, a2
+; RV64M-NEXT: srli a2, a1, 3
+; RV64M-NEXT: and a1, a2, a1
; RV64M-NEXT: mul a0, a0, a1
; RV64M-NEXT: srli a0, a0, 56
; RV64M-NEXT: ret
@@ -2873,32 +3449,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: addi sp, sp, -16
; RV64XTHEADBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64XTHEADBB-NEXT: srli a1, a0, 1
-; RV64XTHEADBB-NEXT: lui a2, 349525
-; RV64XTHEADBB-NEXT: addiw a2, a2, 1365
-; RV64XTHEADBB-NEXT: slli a3, a2, 32
-; RV64XTHEADBB-NEXT: add a2, a2, a3
-; RV64XTHEADBB-NEXT: and a1, a1, a2
-; RV64XTHEADBB-NEXT: sub a0, a0, a1
-; RV64XTHEADBB-NEXT: lui a1, 209715
-; RV64XTHEADBB-NEXT: addiw a1, a1, 819
-; RV64XTHEADBB-NEXT: slli a2, a1, 32
-; RV64XTHEADBB-NEXT: add a1, a1, a2
-; RV64XTHEADBB-NEXT: and a2, a0, a1
-; RV64XTHEADBB-NEXT: srli a0, a0, 2
-; RV64XTHEADBB-NEXT: and a0, a0, a1
-; RV64XTHEADBB-NEXT: add a0, a2, a0
-; RV64XTHEADBB-NEXT: srli a1, a0, 4
-; RV64XTHEADBB-NEXT: add a0, a0, a1
; RV64XTHEADBB-NEXT: lui a1, 61681
; RV64XTHEADBB-NEXT: addiw a1, a1, -241
; RV64XTHEADBB-NEXT: slli a2, a1, 32
; RV64XTHEADBB-NEXT: add a1, a1, a2
+; RV64XTHEADBB-NEXT: slli a2, a1, 2
+; RV64XTHEADBB-NEXT: xor a2, a2, a1
+; RV64XTHEADBB-NEXT: srli a3, a0, 1
+; RV64XTHEADBB-NEXT: lui a4, 349525
+; RV64XTHEADBB-NEXT: addiw a4, a4, 1365
+; RV64XTHEADBB-NEXT: slli a5, a4, 32
+; RV64XTHEADBB-NEXT: add a4, a4, a5
+; RV64XTHEADBB-NEXT: and a3, a3, a4
+; RV64XTHEADBB-NEXT: sub a0, a0, a3
+; RV64XTHEADBB-NEXT: and a3, a0, a2
+; RV64XTHEADBB-NEXT: srli a0, a0, 2
+; RV64XTHEADBB-NEXT: and a0, a0, a2
+; RV64XTHEADBB-NEXT: add a0, a3, a0
+; RV64XTHEADBB-NEXT: srli a2, a0, 4
+; RV64XTHEADBB-NEXT: add a0, a0, a2
; RV64XTHEADBB-NEXT: and a0, a0, a1
-; RV64XTHEADBB-NEXT: lui a1, 4112
-; RV64XTHEADBB-NEXT: addiw a1, a1, 257
-; RV64XTHEADBB-NEXT: slli a2, a1, 32
-; RV64XTHEADBB-NEXT: add a1, a1, a2
+; RV64XTHEADBB-NEXT: srli a2, a1, 3
+; RV64XTHEADBB-NEXT: and a1, a2, a1
; RV64XTHEADBB-NEXT: call __muldi3
; RV64XTHEADBB-NEXT: srli a0, a0, 56
; RV64XTHEADBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
index adf614435b31d7..007bbf79d23639 100644
--- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
+++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
@@ -85,24 +85,36 @@ define signext i32 @ctz_dereferencing_pointer(ptr %b) nounwind {
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT: ld s0, 0(a0)
-; RV64I-NEXT: neg a0, s0
-; RV64I-NEXT: and a0, s0, a0
-; RV64I-NEXT: lui a1, %hi(.LCPI0_0)
-; RV64I-NEXT: ld a1, %lo(.LCPI0_0)(a1)
+; RV64I-NEXT: ld a0, 0(a0)
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: addi a3, a0, -1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: slli a5, a4, 32
+; RV64I-NEXT: add a4, a4, a5
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: slli a0, a0, 2
; RV64I-NEXT: srli a0, a0, 58
-; RV64I-NEXT: lui a1, %hi(.LCPI0_1)
-; RV64I-NEXT: addi a1, a1, %lo(.LCPI0_1)
-; RV64I-NEXT: add a0, a1, a0
-; RV64I-NEXT: lbu a0, 0(a0)
-; RV64I-NEXT: seqz a1, s0
-; RV64I-NEXT: addi a1, a1, -1
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: andi a0, a0, 63
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -548,24 +560,35 @@ define signext i32 @ctz4(i64 %b) nounwind {
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT: mv s0, a0
-; RV64I-NEXT: neg a0, a0
-; RV64I-NEXT: and a0, s0, a0
-; RV64I-NEXT: lui a1, %hi(.LCPI6_0)
-; RV64I-NEXT: ld a1, %lo(.LCPI6_0)(a1)
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: addi a3, a0, -1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: slli a5, a4, 32
+; RV64I-NEXT: add a4, a4, a5
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: slli a0, a0, 2
; RV64I-NEXT: srli a0, a0, 58
-; RV64I-NEXT: lui a1, %hi(.LCPI6_1)
-; RV64I-NEXT: addi a1, a1, %lo(.LCPI6_1)
-; RV64I-NEXT: add a0, a1, a0
-; RV64I-NEXT: lbu a0, 0(a0)
-; RV64I-NEXT: seqz a1, s0
-; RV64I-NEXT: addi a1, a1, -1
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: andi a0, a0, 63
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -712,25 +735,21 @@ define signext i32 @ctlz(i64 %b) nounwind {
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: slli a0, a0, 2
; RV64I-NEXT: srli a0, a0, 58
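
The cttz-flavoured tests above change shape as well: the de Bruijn table lookup through the .LCPI constant-pool loads is gone, and the trailing-zero count is obtained by popcounting (x - 1) & ~x (the "addi a3, a0, -1; not a0, a0; and a0, a0, a3" prologue) with the same derived masks. A small sketch of that identity, using std::popcount purely for brevity (illustrative only, not code from this patch):

#include <bit>
#include <cassert>
#include <cstdint>

// (x - 1) & ~x is the mask of trailing zeros (all ones when x == 0),
// so counting its set bits yields cttz, with cttz(0) == 64.
constexpr int cttz64(uint64_t x) {
  return std::popcount((x - 1) & ~x);
}

static_assert(cttz64(1) == 0 && cttz64(0x20) == 5 && cttz64(0) == 64);

int main() {
  for (int i = 0; i < 64; ++i)
    assert(cttz64(uint64_t{1} << i) == i);
  return 0;
}
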
diff --git a/llvm/test/CodeGen/RISCV/pr56457.ll b/llvm/test/CodeGen/RISCV/pr56457.ll
index ba08aa838bf992..bb7239862de1e0 100644
--- a/llvm/test/CodeGen/RISCV/pr56457.ll
+++ b/llvm/test/CodeGen/RISCV/pr56457.ll
@@ -28,25 +28,21 @@ define i15 @foo(i15 %x) nounwind {
; CHECK-NEXT: slli a0, a0, 49
; CHECK-NEXT: srli a0, a0, 49
; CHECK-NEXT: sub a0, a0, a1
-; CHECK-NEXT: lui a1, 209715
-; CHECK-NEXT: addiw a1, a1, 819
-; CHECK-NEXT: slli a2, a1, 32
-; CHECK-NEXT: add a1, a1, a2
-; CHECK-NEXT: and a2, a0, a1
-; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: and a0, a0, a1
-; CHECK-NEXT: add a0, a2, a0
-; CHECK-NEXT: srli a1, a0, 4
-; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: lui a1, 61681
; CHECK-NEXT: addiw a1, a1, -241
; CHECK-NEXT: slli a2, a1, 32
; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: slli a2, a1, 2
+; CHECK-NEXT: xor a2, a2, a1
+; CHECK-NEXT: and a3, a0, a2
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: add a0, a3, a0
+; CHECK-NEXT: srli a2, a0, 4
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: and a0, a0, a1
-; CHECK-NEXT: lui a1, 4112
-; CHECK-NEXT: addiw a1, a1, 257
-; CHECK-NEXT: slli a2, a1, 32
-; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: srli a2, a1, 3
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: srli a0, a0, 56
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll
index 73bfc6480b4d75..929cf7d35d674b 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll
@@ -339,25 +339,21 @@ define i64 @ctlz_i64(i64 %a) nounwind {
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
@@ -591,16 +587,33 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV64I-NEXT: # %bb.1: # %cond.false
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: neg a1, a0
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: addi a3, a0, -1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: slli a5, a4, 32
+; RV64I-NEXT: add a4, a4, a5
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, %hi(.LCPI10_0)
-; RV64I-NEXT: ld a1, %lo(.LCPI10_0)(a1)
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srli a0, a0, 58
-; RV64I-NEXT: lui a1, %hi(.LCPI10_1)
-; RV64I-NEXT: addi a1, a1, %lo(.LCPI10_1)
-; RV64I-NEXT: add a0, a1, a0
-; RV64I-NEXT: lbu a0, 0(a0)
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll
index 7feef4dad4116a..268488b14d8069 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll
@@ -329,25 +329,21 @@ define i64 @ctlz_i64(i64 %a) nounwind {
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
@@ -514,16 +510,33 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV64I-NEXT: # %bb.1: # %cond.false
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: neg a1, a0
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: addi a3, a0, -1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: slli a5, a4, 32
+; RV64I-NEXT: add a4, a4, a5
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, %hi(.LCPI10_0)
-; RV64I-NEXT: ld a1, %lo(.LCPI10_0)(a1)
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srli a0, a0, 58
-; RV64I-NEXT: lui a1, %hi(.LCPI10_1)
-; RV64I-NEXT: addi a1, a1, %lo(.LCPI10_1)
-; RV64I-NEXT: add a0, a1, a0
-; RV64I-NEXT: lbu a0, 0(a0)
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -625,32 +638,28 @@ define i64 @ctpop_i64(i64 %a) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: srli a1, a0, 1
-; RV64I-NEXT: lui a2, 349525
-; RV64I-NEXT: addiw a2, a2, 1365
-; RV64I-NEXT: slli a3, a2, 32
-; RV64I-NEXT: add a2, a2, a3
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: slli a5, a4, 32
+; RV64I-NEXT: add a4, a4, a5
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
index 1f62ea9f568191..7684a904ae9ce0 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
@@ -28,22 +28,26 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -83,22 +87,26 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: j .LBB1_3
@@ -148,22 +156,26 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a1, a0, 24
+; RV64I-NEXT: srli a1, a0, 56
; RV64I-NEXT: .LBB2_2: # %cond.end
; RV64I-NEXT: sub a0, s0, a1
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
@@ -208,22 +220,26 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: xori a0, a0, 31
; RV64I-NEXT: snez a1, s0
; RV64I-NEXT: addi a1, a1, -1
@@ -275,22 +291,26 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -339,25 +359,21 @@ define i64 @ctlz_i64(i64 %a) nounwind {
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
@@ -548,16 +564,33 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV64I-NEXT: # %bb.1: # %cond.false
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: neg a1, a0
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: addi a3, a0, -1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: slli a5, a4, 32
+; RV64I-NEXT: add a4, a4, a5
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, %hi(.LCPI10_0)
-; RV64I-NEXT: ld a1, %lo(.LCPI10_0)(a1)
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srli a0, a0, 58
-; RV64I-NEXT: lui a1, %hi(.LCPI10_1)
-; RV64I-NEXT: addi a1, a1, %lo(.LCPI10_1)
-; RV64I-NEXT: add a0, a1, a0
-; RV64I-NEXT: lbu a0, 0(a0)
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 2269d8d04c9cb0..b8d4d9706dc0ad 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -28,22 +28,26 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -81,22 +85,26 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: j .LBB1_3
@@ -144,22 +152,26 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a1, a0, 24
+; RV64I-NEXT: srli a1, a0, 56
; RV64I-NEXT: .LBB2_2: # %cond.end
; RV64I-NEXT: sub a0, s0, a1
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
@@ -202,22 +214,26 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: xori a0, a0, 31
; RV64I-NEXT: snez a1, s0
; RV64I-NEXT: addi a1, a1, -1
@@ -267,22 +283,26 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -329,25 +349,21 @@ define i64 @ctlz_i64(i64 %a) nounwind {
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
@@ -514,16 +530,33 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV64I-NEXT: # %bb.1: # %cond.false
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: neg a1, a0
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: addi a3, a0, -1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: slli a5, a4, 32
+; RV64I-NEXT: add a4, a4, a5
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, %hi(.LCPI10_0)
-; RV64I-NEXT: ld a1, %lo(.LCPI10_0)(a1)
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srli a0, a0, 58
-; RV64I-NEXT: lui a1, %hi(.LCPI10_1)
-; RV64I-NEXT: addi a1, a1, %lo(.LCPI10_1)
-; RV64I-NEXT: add a0, a1, a0
-; RV64I-NEXT: lbu a0, 0(a0)
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -546,26 +579,30 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: srli a1, a0, 1
-; RV64I-NEXT: lui a2, 349525
-; RV64I-NEXT: addiw a2, a2, 1365
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
+; RV64I-NEXT: slli a1, a0, 32
+; RV64I-NEXT: srli a1, a1, 32
+; RV64I-NEXT: lui a2, 61681
+; RV64I-NEXT: addiw a2, a2, -241
+; RV64I-NEXT: slli a3, a2, 32
+; RV64I-NEXT: add a2, a2, a3
+; RV64I-NEXT: slli a3, a2, 2
+; RV64I-NEXT: xor a3, a3, a2
+; RV64I-NEXT: srli a0, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: and a0, a0, a4
+; RV64I-NEXT: sub a1, a1, a0
+; RV64I-NEXT: srli a0, a1, 2
+; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: and a1, a1, a3
+; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addiw a1, a1, -241
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: srli a1, a2, 3
+; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -659,27 +696,29 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lw a0, 0(a0)
-; RV64I-NEXT: srli a1, a0, 1
-; RV64I-NEXT: lui a2, 349525
-; RV64I-NEXT: addiw a2, a2, 1365
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: lwu a0, 0(a0)
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: srli a3, a0, 2
+; RV64I-NEXT: and a3, a3, a2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a0, a3
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
@@ -707,41 +746,48 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind {
; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s5, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a1
-; RV64I-NEXT: srli a1, a0, 1
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: srli s3, a1, 32
+; RV64I-NEXT: slli a1, a0, 32
+; RV64I-NEXT: srli a1, a1, 32
+; RV64I-NEXT: lui a2, 61681
+; RV64I-NEXT: addiw a2, a2, -241
+; RV64I-NEXT: slli a3, a2, 32
+; RV64I-NEXT: add s4, a2, a3
+; RV64I-NEXT: slli a2, s4, 2
+; RV64I-NEXT: xor s5, a2, s4
+; RV64I-NEXT: srli a0, a0, 1
; RV64I-NEXT: lui a2, 349525
-; RV64I-NEXT: addiw s3, a2, 1365
-; RV64I-NEXT: and a1, a1, s3
-; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw s4, a1, 819
-; RV64I-NEXT: and a1, a0, s4
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, s4
+; RV64I-NEXT: addiw s6, a2, 1365
+; RV64I-NEXT: and a0, a0, s6
+; RV64I-NEXT: sub a1, a1, a0
+; RV64I-NEXT: srli a0, a1, 2
+; RV64I-NEXT: and a0, a0, s5
+; RV64I-NEXT: and a1, a1, s5
; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addiw s5, a1, -241
-; RV64I-NEXT: and a0, a0, s5
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw s1, a1, 257
+; RV64I-NEXT: and a0, a0, s4
+; RV64I-NEXT: srli a1, s4, 3
+; RV64I-NEXT: and s1, a1, s4
; RV64I-NEXT: mv a1, s1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw s2, a0, 24
-; RV64I-NEXT: srli a0, s0, 1
-; RV64I-NEXT: and a0, a0, s3
-; RV64I-NEXT: sub s0, s0, a0
-; RV64I-NEXT: and a0, s0, s4
-; RV64I-NEXT: srli s0, s0, 2
-; RV64I-NEXT: and a1, s0, s4
+; RV64I-NEXT: srli s2, a0, 56
+; RV64I-NEXT: srli s0, s0, 1
+; RV64I-NEXT: and a0, s0, s6
+; RV64I-NEXT: sub a0, s3, a0
+; RV64I-NEXT: srli a1, a0, 2
+; RV64I-NEXT: and a1, a1, s5
+; RV64I-NEXT: and a0, a0, s5
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: and a0, a0, s5
+; RV64I-NEXT: and a0, a0, s4
; RV64I-NEXT: mv a1, s1
; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a1, a0, 24
+; RV64I-NEXT: srli a1, a0, 56
; RV64I-NEXT: mv a0, s2
; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
@@ -750,6 +796,7 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind {
; RV64I-NEXT: ld s3, 24(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s4, 16(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s5, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 0(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
@@ -877,32 +924,28 @@ define i64 @ctpop_i64(i64 %a) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: srli a1, a0, 1
-; RV64I-NEXT: lui a2, 349525
-; RV64I-NEXT: addiw a2, a2, 1365
-; RV64I-NEXT: slli a3, a2, 32
-; RV64I-NEXT: add a2, a2, a3
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: slli a2, a1, 2
+; RV64I-NEXT: xor a2, a2, a1
+; RV64I-NEXT: srli a3, a0, 1
+; RV64I-NEXT: lui a4, 349525
+; RV64I-NEXT: addiw a4, a4, 1365
+; RV64I-NEXT: slli a5, a4, 32
+; RV64I-NEXT: add a4, a4, a5
+; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: sub a0, a0, a3
+; RV64I-NEXT: and a3, a0, a2
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: add a0, a3, a0
+; RV64I-NEXT: srli a2, a0, 4
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: srli a2, a1, 3
+; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
@@ -1007,37 +1050,33 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s5, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a1
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: add s3, a1, a2
+; RV64I-NEXT: slli a1, s3, 2
+; RV64I-NEXT: xor s4, a1, s3
; RV64I-NEXT: srli a1, a0, 1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: slli a3, a2, 32
-; RV64I-NEXT: add s3, a2, a3
-; RV64I-NEXT: and a1, a1, s3
+; RV64I-NEXT: add s5, a2, a3
+; RV64I-NEXT: and a1, a1, s5
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add s4, a1, a2
; RV64I-NEXT: and a1, a0, s4
; RV64I-NEXT: srli a0, a0, 2
; RV64I-NEXT: and a0, a0, s4
; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addiw a1, a1, -241
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add s5, a1, a2
-; RV64I-NEXT: and a0, a0, s5
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw s1, a1, 257
-; RV64I-NEXT: slli a1, s1, 32
-; RV64I-NEXT: add s1, s1, a1
+; RV64I-NEXT: and a0, a0, s3
+; RV64I-NEXT: srli a1, s3, 3
+; RV64I-NEXT: and s1, a1, s3
; RV64I-NEXT: mv a1, s1
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srli s2, a0, 56
; RV64I-NEXT: srli a0, s0, 1
-; RV64I-NEXT: and a0, a0, s3
+; RV64I-NEXT: and a0, a0, s5
; RV64I-NEXT: sub s0, s0, a0
; RV64I-NEXT: and a0, s0, s4
; RV64I-NEXT: srli s0, s0, 2
@@ -1045,7 +1084,7 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: and a0, a0, s5
+; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: mv a1, s1
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srli a1, a0, 56
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
index fc94f8c2a52797..4b1bfa48a95d82 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
@@ -1147,35 +1147,26 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
; RV32I-NEXT: vor.vv v8, v8, v9
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v9, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; RV32I-NEXT: vmv.v.x v10, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v9, v9, v10
+; RV32I-NEXT: vsll.vi v11, v10, 2
+; RV32I-NEXT: vxor.vv v11, v10, v11
+; RV32I-NEXT: vadd.vv v12, v11, v11
+; RV32I-NEXT: vxor.vv v12, v11, v12
+; RV32I-NEXT: vand.vv v9, v9, v12
; RV32I-NEXT: vsub.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v10, v8, v9
+; RV32I-NEXT: vand.vv v9, v8, v11
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vand.vv v8, v8, v11
+; RV32I-NEXT: vadd.vv v8, v9, v8
; RV32I-NEXT: vsrl.vi v9, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 61681
-; RV32I-NEXT: addi a0, a0, -241
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v9, v10, 3
+; RV32I-NEXT: vand.vv v9, v10, v9
; RV32I-NEXT: vmul.vv v8, v8, v9
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
@@ -1199,32 +1190,27 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
; RV64I-NEXT: vor.vv v8, v8, v9
; RV64I-NEXT: vnot.v v8, v8
; RV64I-NEXT: vsrl.vi v9, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vmv.v.x v10, a0
+; RV64I-NEXT: vsll.vi v11, v10, 2
+; RV64I-NEXT: vxor.vx v11, v11, a0
+; RV64I-NEXT: vadd.vv v12, v11, v11
+; RV64I-NEXT: vxor.vv v12, v11, v12
+; RV64I-NEXT: vand.vv v9, v9, v12
; RV64I-NEXT: vsub.vv v8, v8, v9
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vand.vv v9, v8, v11
; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vand.vv v8, v8, v11
; RV64I-NEXT: vadd.vv v8, v9, v8
; RV64I-NEXT: vsrl.vi v9, v8, 4
; RV64I-NEXT: vadd.vv v8, v8, v9
-; RV64I-NEXT: lui a0, 61681
-; RV64I-NEXT: addiw a0, a0, -241
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v9, v10, 3
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vmul.vv v8, v8, v9
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
; RV64I-NEXT: ret
@@ -1288,35 +1274,26 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
; RV32I-NEXT: vor.vv v8, v8, v10
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v10, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; RV32I-NEXT: vmv.v.x v12, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v10, v10, v12
+; RV32I-NEXT: vsll.vi v14, v12, 2
+; RV32I-NEXT: vxor.vv v14, v12, v14
+; RV32I-NEXT: vadd.vv v16, v14, v14
+; RV32I-NEXT: vxor.vv v16, v14, v16
+; RV32I-NEXT: vand.vv v10, v10, v16
; RV32I-NEXT: vsub.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v12, v8, v10
+; RV32I-NEXT: vand.vv v10, v8, v14
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vand.vv v8, v8, v14
+; RV32I-NEXT: vadd.vv v8, v10, v8
; RV32I-NEXT: vsrl.vi v10, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 61681
-; RV32I-NEXT: addi a0, a0, -241
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v10, v12, 3
+; RV32I-NEXT: vand.vv v10, v12, v10
; RV32I-NEXT: vmul.vv v8, v8, v10
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
@@ -1340,32 +1317,27 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
; RV64I-NEXT: vor.vv v8, v8, v10
; RV64I-NEXT: vnot.v v8, v8
; RV64I-NEXT: vsrl.vi v10, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vmv.v.x v12, a0
+; RV64I-NEXT: vsll.vi v14, v12, 2
+; RV64I-NEXT: vxor.vx v14, v14, a0
+; RV64I-NEXT: vadd.vv v16, v14, v14
+; RV64I-NEXT: vxor.vv v16, v14, v16
+; RV64I-NEXT: vand.vv v10, v10, v16
; RV64I-NEXT: vsub.vv v8, v8, v10
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vand.vv v10, v8, v14
; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vand.vv v8, v8, v14
; RV64I-NEXT: vadd.vv v8, v10, v8
; RV64I-NEXT: vsrl.vi v10, v8, 4
; RV64I-NEXT: vadd.vv v8, v8, v10
-; RV64I-NEXT: lui a0, 61681
-; RV64I-NEXT: addiw a0, a0, -241
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v10, v12, 3
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vmul.vv v8, v8, v10
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
; RV64I-NEXT: ret
@@ -1429,35 +1401,26 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
; RV32I-NEXT: vor.vv v8, v8, v12
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v12, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; RV32I-NEXT: vmv.v.x v16, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v12, v12, v16
+; RV32I-NEXT: vsll.vi v20, v16, 2
+; RV32I-NEXT: vxor.vv v20, v16, v20
+; RV32I-NEXT: vadd.vv v24, v20, v20
+; RV32I-NEXT: vxor.vv v24, v20, v24
+; RV32I-NEXT: vand.vv v12, v12, v24
; RV32I-NEXT: vsub.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v16, v8, v12
+; RV32I-NEXT: vand.vv v12, v8, v20
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vand.vv v8, v8, v20
+; RV32I-NEXT: vadd.vv v8, v12, v8
; RV32I-NEXT: vsrl.vi v12, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 61681
-; RV32I-NEXT: addi a0, a0, -241
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v12, v16, 3
+; RV32I-NEXT: vand.vv v12, v16, v12
; RV32I-NEXT: vmul.vv v8, v8, v12
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
@@ -1481,32 +1444,27 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
; RV64I-NEXT: vor.vv v8, v8, v12
; RV64I-NEXT: vnot.v v8, v8
; RV64I-NEXT: vsrl.vi v12, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vmv.v.x v16, a0
+; RV64I-NEXT: vsll.vi v20, v16, 2
+; RV64I-NEXT: vxor.vx v20, v20, a0
+; RV64I-NEXT: vadd.vv v24, v20, v20
+; RV64I-NEXT: vxor.vv v24, v20, v24
+; RV64I-NEXT: vand.vv v12, v12, v24
; RV64I-NEXT: vsub.vv v8, v8, v12
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v12, v8, a0
+; RV64I-NEXT: vand.vv v12, v8, v20
; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vand.vv v8, v8, v20
; RV64I-NEXT: vadd.vv v8, v12, v8
; RV64I-NEXT: vsrl.vi v12, v8, 4
; RV64I-NEXT: vadd.vv v8, v8, v12
-; RV64I-NEXT: lui a0, 61681
-; RV64I-NEXT: addiw a0, a0, -241
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v12, v16, 3
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vmul.vv v8, v8, v12
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
; RV64I-NEXT: ret
@@ -1554,6 +1512,12 @@ declare <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64>, i1)
define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
; RV32I-LABEL: ctlz_nxv8i64:
; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: sub sp, sp, a0
+; RV32I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; RV32I-NEXT: vsrl.vi v16, v8, 1
; RV32I-NEXT: vor.vv v8, v8, v16
@@ -1569,43 +1533,58 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
; RV32I-NEXT: vsrl.vx v16, v8, a0
; RV32I-NEXT: vor.vv v8, v8, v16
; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vsrl.vi v16, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v24, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vand.vv v16, v16, v24
-; RV32I-NEXT: vsub.vv v8, v8, v16
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vand.vv v24, v8, v16
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v16
-; RV32I-NEXT: vadd.vv v8, v24, v8
-; RV32I-NEXT: vsrl.vi v16, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v16
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32I-NEXT: vsrl.vi v8, v8, 1
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 3
+; RV32I-NEXT: add a0, sp, a0
+; RV32I-NEXT: addi a0, a0, 16
+; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v16
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
+; RV32I-NEXT: vmv.v.x v8, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vmul.vv v8, v8, v16
+; RV32I-NEXT: vsll.vi v0, v8, 2
+; RV32I-NEXT: vxor.vv v0, v8, v0
+; RV32I-NEXT: vadd.vv v24, v0, v0
+; RV32I-NEXT: vxor.vv v24, v0, v24
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 3
+; RV32I-NEXT: add a0, sp, a0
+; RV32I-NEXT: addi a0, a0, 16
+; RV32I-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT: vand.vv v24, v16, v24
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT: vsub.vv v16, v16, v24
+; RV32I-NEXT: vand.vv v24, v16, v0
+; RV32I-NEXT: vsrl.vi v16, v16, 2
+; RV32I-NEXT: vand.vv v16, v16, v0
+; RV32I-NEXT: vadd.vv v16, v24, v16
+; RV32I-NEXT: vsrl.vi v24, v16, 4
+; RV32I-NEXT: vadd.vv v16, v16, v24
+; RV32I-NEXT: vand.vv v16, v16, v8
+; RV32I-NEXT: vsrl.vi v24, v8, 3
+; RV32I-NEXT: vand.vv v8, v8, v24
+; RV32I-NEXT: vmul.vv v8, v16, v8
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: add sp, sp, a0
+; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
; RV64I-LABEL: ctlz_nxv8i64:
; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: .cfi_def_cfa_offset 16
+; RV64I-NEXT: csrr a0, vlenb
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: sub sp, sp, a0
+; RV64I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV64I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; RV64I-NEXT: vsrl.vi v16, v8, 1
; RV64I-NEXT: vor.vv v8, v8, v16
@@ -1620,36 +1599,49 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
; RV64I-NEXT: li a0, 32
; RV64I-NEXT: vsrl.vx v16, v8, a0
; RV64I-NEXT: vor.vv v8, v8, v16
-; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vsrl.vi v16, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v16, v16, a0
-; RV64I-NEXT: vsub.vv v8, v8, v16
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v16, v8, a0
-; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: vadd.vv v8, v16, v8
-; RV64I-NEXT: vsrl.vi v16, v8, 4
-; RV64I-NEXT: vadd.vv v8, v8, v16
+; RV64I-NEXT: vnot.v v16, v8
; RV64I-NEXT: lui a0, 61681
; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: vmv.v.x v8, a0
+; RV64I-NEXT: addi a1, sp, 16
+; RV64I-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64I-NEXT: vsll.vi v24, v8, 2
+; RV64I-NEXT: vxor.vx v24, v24, a0
+; RV64I-NEXT: vadd.vv v0, v24, v24
+; RV64I-NEXT: vxor.vv v0, v24, v0
+; RV64I-NEXT: csrr a1, vlenb
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: add a1, sp, a1
+; RV64I-NEXT: addi a1, a1, 16
+; RV64I-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill
+; RV64I-NEXT: vsrl.vi v0, v16, 1
+; RV64I-NEXT: csrr a1, vlenb
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: add a1, sp, a1
+; RV64I-NEXT: addi a1, a1, 16
+; RV64I-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64I-NEXT: vand.vv v0, v0, v8
+; RV64I-NEXT: vsub.vv v16, v16, v0
+; RV64I-NEXT: vand.vv v0, v16, v24
+; RV64I-NEXT: vsrl.vi v16, v16, 2
+; RV64I-NEXT: vand.vv v16, v16, v24
+; RV64I-NEXT: vadd.vv v16, v0, v16
+; RV64I-NEXT: vsrl.vi v24, v16, 4
+; RV64I-NEXT: vadd.vv v16, v16, v24
+; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: addi a1, sp, 16
+; RV64I-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64I-NEXT: vsrl.vi v8, v8, 3
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vmul.vv v8, v16, v8
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: csrr a0, vlenb
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: add sp, sp, a0
+; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
;
; CHECK-F-LABEL: ctlz_nxv8i64:
@@ -2753,35 +2745,26 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
; RV32I-NEXT: vor.vv v8, v8, v9
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v9, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; RV32I-NEXT: vmv.v.x v10, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v9, v9, v10
+; RV32I-NEXT: vsll.vi v11, v10, 2
+; RV32I-NEXT: vxor.vv v11, v10, v11
+; RV32I-NEXT: vadd.vv v12, v11, v11
+; RV32I-NEXT: vxor.vv v12, v11, v12
+; RV32I-NEXT: vand.vv v9, v9, v12
; RV32I-NEXT: vsub.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v10, v8, v9
+; RV32I-NEXT: vand.vv v9, v8, v11
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vand.vv v8, v8, v11
+; RV32I-NEXT: vadd.vv v8, v9, v8
; RV32I-NEXT: vsrl.vi v9, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 61681
-; RV32I-NEXT: addi a0, a0, -241
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v9, v10, 3
+; RV32I-NEXT: vand.vv v9, v10, v9
; RV32I-NEXT: vmul.vv v8, v8, v9
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
@@ -2805,32 +2788,27 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
; RV64I-NEXT: vor.vv v8, v8, v9
; RV64I-NEXT: vnot.v v8, v8
; RV64I-NEXT: vsrl.vi v9, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vmv.v.x v10, a0
+; RV64I-NEXT: vsll.vi v11, v10, 2
+; RV64I-NEXT: vxor.vx v11, v11, a0
+; RV64I-NEXT: vadd.vv v12, v11, v11
+; RV64I-NEXT: vxor.vv v12, v11, v12
+; RV64I-NEXT: vand.vv v9, v9, v12
; RV64I-NEXT: vsub.vv v8, v8, v9
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vand.vv v9, v8, v11
; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vand.vv v8, v8, v11
; RV64I-NEXT: vadd.vv v8, v9, v8
; RV64I-NEXT: vsrl.vi v9, v8, 4
; RV64I-NEXT: vadd.vv v8, v8, v9
-; RV64I-NEXT: lui a0, 61681
-; RV64I-NEXT: addiw a0, a0, -241
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v9, v10, 3
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vmul.vv v8, v8, v9
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
; RV64I-NEXT: ret
@@ -2889,35 +2867,26 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
; RV32I-NEXT: vor.vv v8, v8, v10
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v10, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; RV32I-NEXT: vmv.v.x v12, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v10, v10, v12
+; RV32I-NEXT: vsll.vi v14, v12, 2
+; RV32I-NEXT: vxor.vv v14, v12, v14
+; RV32I-NEXT: vadd.vv v16, v14, v14
+; RV32I-NEXT: vxor.vv v16, v14, v16
+; RV32I-NEXT: vand.vv v10, v10, v16
; RV32I-NEXT: vsub.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v12, v8, v10
+; RV32I-NEXT: vand.vv v10, v8, v14
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vand.vv v8, v8, v14
+; RV32I-NEXT: vadd.vv v8, v10, v8
; RV32I-NEXT: vsrl.vi v10, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 61681
-; RV32I-NEXT: addi a0, a0, -241
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v10, v12, 3
+; RV32I-NEXT: vand.vv v10, v12, v10
; RV32I-NEXT: vmul.vv v8, v8, v10
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
@@ -2941,32 +2910,27 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
; RV64I-NEXT: vor.vv v8, v8, v10
; RV64I-NEXT: vnot.v v8, v8
; RV64I-NEXT: vsrl.vi v10, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vmv.v.x v12, a0
+; RV64I-NEXT: vsll.vi v14, v12, 2
+; RV64I-NEXT: vxor.vx v14, v14, a0
+; RV64I-NEXT: vadd.vv v16, v14, v14
+; RV64I-NEXT: vxor.vv v16, v14, v16
+; RV64I-NEXT: vand.vv v10, v10, v16
; RV64I-NEXT: vsub.vv v8, v8, v10
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vand.vv v10, v8, v14
; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vand.vv v8, v8, v14
; RV64I-NEXT: vadd.vv v8, v10, v8
; RV64I-NEXT: vsrl.vi v10, v8, 4
; RV64I-NEXT: vadd.vv v8, v8, v10
-; RV64I-NEXT: lui a0, 61681
-; RV64I-NEXT: addiw a0, a0, -241
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v10, v12, 3
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vmul.vv v8, v8, v10
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
; RV64I-NEXT: ret
@@ -3025,35 +2989,26 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
; RV32I-NEXT: vor.vv v8, v8, v12
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v12, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; RV32I-NEXT: vmv.v.x v16, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v12, v12, v16
+; RV32I-NEXT: vsll.vi v20, v16, 2
+; RV32I-NEXT: vxor.vv v20, v16, v20
+; RV32I-NEXT: vadd.vv v24, v20, v20
+; RV32I-NEXT: vxor.vv v24, v20, v24
+; RV32I-NEXT: vand.vv v12, v12, v24
; RV32I-NEXT: vsub.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v16, v8, v12
+; RV32I-NEXT: vand.vv v12, v8, v20
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vand.vv v8, v8, v20
+; RV32I-NEXT: vadd.vv v8, v12, v8
; RV32I-NEXT: vsrl.vi v12, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 61681
-; RV32I-NEXT: addi a0, a0, -241
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v12, v16, 3
+; RV32I-NEXT: vand.vv v12, v16, v12
; RV32I-NEXT: vmul.vv v8, v8, v12
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
@@ -3077,32 +3032,27 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
; RV64I-NEXT: vor.vv v8, v8, v12
; RV64I-NEXT: vnot.v v8, v8
; RV64I-NEXT: vsrl.vi v12, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vmv.v.x v16, a0
+; RV64I-NEXT: vsll.vi v20, v16, 2
+; RV64I-NEXT: vxor.vx v20, v20, a0
+; RV64I-NEXT: vadd.vv v24, v20, v20
+; RV64I-NEXT: vxor.vv v24, v20, v24
+; RV64I-NEXT: vand.vv v12, v12, v24
; RV64I-NEXT: vsub.vv v8, v8, v12
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v12, v8, a0
+; RV64I-NEXT: vand.vv v12, v8, v20
; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vand.vv v8, v8, v20
; RV64I-NEXT: vadd.vv v8, v12, v8
; RV64I-NEXT: vsrl.vi v12, v8, 4
; RV64I-NEXT: vadd.vv v8, v8, v12
-; RV64I-NEXT: lui a0, 61681
-; RV64I-NEXT: addiw a0, a0, -241
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v12, v16, 3
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vmul.vv v8, v8, v12
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
; RV64I-NEXT: ret
@@ -3145,6 +3095,12 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
; RV32I-LABEL: ctlz_zero_undef_nxv8i64:
; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: sub sp, sp, a0
+; RV32I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; RV32I-NEXT: vsrl.vi v16, v8, 1
; RV32I-NEXT: vor.vv v8, v8, v16
@@ -3160,43 +3116,58 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
; RV32I-NEXT: vsrl.vx v16, v8, a0
; RV32I-NEXT: vor.vv v8, v8, v16
; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vsrl.vi v16, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v24, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vand.vv v16, v16, v24
-; RV32I-NEXT: vsub.vv v8, v8, v16
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vand.vv v24, v8, v16
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v16
-; RV32I-NEXT: vadd.vv v8, v24, v8
-; RV32I-NEXT: vsrl.vi v16, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v16
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32I-NEXT: vsrl.vi v8, v8, 1
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 3
+; RV32I-NEXT: add a0, sp, a0
+; RV32I-NEXT: addi a0, a0, 16
+; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v16
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
+; RV32I-NEXT: vmv.v.x v8, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vmul.vv v8, v8, v16
+; RV32I-NEXT: vsll.vi v0, v8, 2
+; RV32I-NEXT: vxor.vv v0, v8, v0
+; RV32I-NEXT: vadd.vv v24, v0, v0
+; RV32I-NEXT: vxor.vv v24, v0, v24
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 3
+; RV32I-NEXT: add a0, sp, a0
+; RV32I-NEXT: addi a0, a0, 16
+; RV32I-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT: vand.vv v24, v16, v24
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT: vsub.vv v16, v16, v24
+; RV32I-NEXT: vand.vv v24, v16, v0
+; RV32I-NEXT: vsrl.vi v16, v16, 2
+; RV32I-NEXT: vand.vv v16, v16, v0
+; RV32I-NEXT: vadd.vv v16, v24, v16
+; RV32I-NEXT: vsrl.vi v24, v16, 4
+; RV32I-NEXT: vadd.vv v16, v16, v24
+; RV32I-NEXT: vand.vv v16, v16, v8
+; RV32I-NEXT: vsrl.vi v24, v8, 3
+; RV32I-NEXT: vand.vv v8, v8, v24
+; RV32I-NEXT: vmul.vv v8, v16, v8
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: add sp, sp, a0
+; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
; RV64I-LABEL: ctlz_zero_undef_nxv8i64:
; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: .cfi_def_cfa_offset 16
+; RV64I-NEXT: csrr a0, vlenb
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: sub sp, sp, a0
+; RV64I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV64I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; RV64I-NEXT: vsrl.vi v16, v8, 1
; RV64I-NEXT: vor.vv v8, v8, v16
@@ -3211,36 +3182,49 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
; RV64I-NEXT: li a0, 32
; RV64I-NEXT: vsrl.vx v16, v8, a0
; RV64I-NEXT: vor.vv v8, v8, v16
-; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vsrl.vi v16, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v16, v16, a0
-; RV64I-NEXT: vsub.vv v8, v8, v16
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v16, v8, a0
-; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: vadd.vv v8, v16, v8
-; RV64I-NEXT: vsrl.vi v16, v8, 4
-; RV64I-NEXT: vadd.vv v8, v8, v16
+; RV64I-NEXT: vnot.v v16, v8
; RV64I-NEXT: lui a0, 61681
; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: vmv.v.x v8, a0
+; RV64I-NEXT: addi a1, sp, 16
+; RV64I-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64I-NEXT: vsll.vi v24, v8, 2
+; RV64I-NEXT: vxor.vx v24, v24, a0
+; RV64I-NEXT: vadd.vv v0, v24, v24
+; RV64I-NEXT: vxor.vv v0, v24, v0
+; RV64I-NEXT: csrr a1, vlenb
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: add a1, sp, a1
+; RV64I-NEXT: addi a1, a1, 16
+; RV64I-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill
+; RV64I-NEXT: vsrl.vi v0, v16, 1
+; RV64I-NEXT: csrr a1, vlenb
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: add a1, sp, a1
+; RV64I-NEXT: addi a1, a1, 16
+; RV64I-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64I-NEXT: vand.vv v0, v0, v8
+; RV64I-NEXT: vsub.vv v16, v16, v0
+; RV64I-NEXT: vand.vv v0, v16, v24
+; RV64I-NEXT: vsrl.vi v16, v16, 2
+; RV64I-NEXT: vand.vv v16, v16, v24
+; RV64I-NEXT: vadd.vv v16, v0, v16
+; RV64I-NEXT: vsrl.vi v24, v16, 4
+; RV64I-NEXT: vadd.vv v16, v16, v24
+; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: addi a1, sp, 16
+; RV64I-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64I-NEXT: vsrl.vi v8, v8, 3
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vmul.vv v8, v16, v8
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: csrr a0, vlenb
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: add sp, sp, a0
+; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
;
; CHECK-F-LABEL: ctlz_zero_undef_nxv8i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
index c310274d685081..a33e7b54288d88 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
@@ -675,37 +675,27 @@ declare <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32>)
define <vscale x 1 x i64> @ctpop_nxv1i64(<vscale x 1 x i64> %va) {
; RV32-LABEL: ctpop_nxv1i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a0
-; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a0
-; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v9, a0
; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT: vsll.vi v10, v9, 2
+; RV32-NEXT: vxor.vv v10, v9, v10
+; RV32-NEXT: vadd.vv v11, v10, v10
+; RV32-NEXT: vxor.vv v11, v10, v11
+; RV32-NEXT: vsrl.vi v12, v8, 1
+; RV32-NEXT: vand.vv v11, v12, v11
+; RV32-NEXT: vsub.vv v8, v8, v11
+; RV32-NEXT: vand.vv v11, v8, v10
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: vadd.vv v8, v11, v8
+; RV32-NEXT: vsrl.vi v10, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a0
-; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vi v10, v9, 3
+; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -713,34 +703,29 @@ define <vscale x 1 x i64> @ctpop_nxv1i64(<vscale x 1 x i64> %va) {
;
; RV64-LABEL: ctpop_nxv1i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
; RV64-NEXT: lui a0, 61681
; RV64-NEXT: addiw a0, a0, -241
; RV64-NEXT: slli a1, a0, 32
; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a0
+; RV64-NEXT: vsll.vi v10, v9, 2
+; RV64-NEXT: vxor.vx v10, v10, a0
+; RV64-NEXT: vadd.vv v11, v10, v10
+; RV64-NEXT: vxor.vv v11, v10, v11
+; RV64-NEXT: vsrl.vi v12, v8, 1
+; RV64-NEXT: vand.vv v11, v12, v11
+; RV64-NEXT: vsub.vv v8, v8, v11
+; RV64-NEXT: vand.vv v11, v8, v10
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vv v8, v8, v10
+; RV64-NEXT: vadd.vv v8, v11, v8
+; RV64-NEXT: vsrl.vi v10, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v10
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vsrl.vi v9, v9, 3
+; RV64-NEXT: vand.vx v9, v9, a0
+; RV64-NEXT: vmul.vv v8, v8, v9
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -758,37 +743,27 @@ declare <vscale x 1 x i64> @llvm.ctpop.nxv1i64(<vscale x 1 x i64>)
define <vscale x 2 x i64> @ctpop_nxv2i64(<vscale x 2 x i64> %va) {
; RV32-LABEL: ctpop_nxv2i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a0
-; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a0
-; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v10, a0
; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV32-NEXT: vsll.vi v12, v10, 2
+; RV32-NEXT: vxor.vv v12, v10, v12
+; RV32-NEXT: vadd.vv v14, v12, v12
+; RV32-NEXT: vxor.vv v14, v12, v14
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v14, v16, v14
+; RV32-NEXT: vsub.vv v8, v8, v14
+; RV32-NEXT: vand.vv v14, v8, v12
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: vadd.vv v8, v14, v8
+; RV32-NEXT: vsrl.vi v12, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v12
; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a0
-; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v12, v10, 3
+; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -796,34 +771,29 @@ define <vscale x 2 x i64> @ctpop_nxv2i64(<vscale x 2 x i64> %va) {
;
; RV64-LABEL: ctpop_nxv2i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
; RV64-NEXT: lui a0, 61681
; RV64-NEXT: addiw a0, a0, -241
; RV64-NEXT: slli a1, a0, 32
; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a0
+; RV64-NEXT: vsll.vi v12, v10, 2
+; RV64-NEXT: vxor.vx v12, v12, a0
+; RV64-NEXT: vadd.vv v14, v12, v12
+; RV64-NEXT: vxor.vv v14, v12, v14
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vand.vv v14, v16, v14
+; RV64-NEXT: vsub.vv v8, v8, v14
+; RV64-NEXT: vand.vv v14, v8, v12
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vv v8, v8, v12
+; RV64-NEXT: vadd.vv v8, v14, v8
+; RV64-NEXT: vsrl.vi v12, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v12
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vsrl.vi v10, v10, 3
+; RV64-NEXT: vand.vx v10, v10, a0
+; RV64-NEXT: vmul.vv v8, v8, v10
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -841,37 +811,27 @@ declare <vscale x 2 x i64> @llvm.ctpop.nxv2i64(<vscale x 2 x i64>)
define <vscale x 4 x i64> @ctpop_nxv4i64(<vscale x 4 x i64> %va) {
; RV32-LABEL: ctpop_nxv4i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a0
-; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a0
-; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; RV32-NEXT: vmv.v.x v12, a0
; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV32-NEXT: vsll.vi v16, v12, 2
+; RV32-NEXT: vxor.vv v16, v12, v16
+; RV32-NEXT: vadd.vv v20, v16, v16
+; RV32-NEXT: vxor.vv v20, v16, v20
+; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vand.vv v20, v24, v20
+; RV32-NEXT: vsub.vv v8, v8, v20
+; RV32-NEXT: vand.vv v20, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vadd.vv v8, v20, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a0
-; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vi v16, v12, 3
+; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -879,34 +839,29 @@ define <vscale x 4 x i64> @ctpop_nxv4i64(<vscale x 4 x i64> %va) {
;
; RV64-LABEL: ctpop_nxv4i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
; RV64-NEXT: lui a0, 61681
; RV64-NEXT: addiw a0, a0, -241
; RV64-NEXT: slli a1, a0, 32
; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v12, a0
+; RV64-NEXT: vsll.vi v16, v12, 2
+; RV64-NEXT: vxor.vx v16, v16, a0
+; RV64-NEXT: vadd.vv v20, v16, v16
+; RV64-NEXT: vxor.vv v20, v16, v20
+; RV64-NEXT: vsrl.vi v24, v8, 1
+; RV64-NEXT: vand.vv v20, v24, v20
+; RV64-NEXT: vsub.vv v8, v8, v20
+; RV64-NEXT: vand.vv v20, v8, v16
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vadd.vv v8, v20, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vsrl.vi v12, v12, 3
+; RV64-NEXT: vand.vx v12, v12, a0
+; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -924,74 +879,107 @@ declare <vscale x 4 x i64> @llvm.ctpop.nxv4i64(<vscale x 4 x i64>)
define <vscale x 8 x i64> @ctpop_nxv8i64(<vscale x 8 x i64> %va) {
; RV32-LABEL: ctpop_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a0
-; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a0
-; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT: vsll.vi v24, v16, 2
+; RV32-NEXT: vxor.vv v24, v16, v24
+; RV32-NEXT: vadd.vv v0, v24, v24
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v16
+; RV32-NEXT: vsub.vv v8, v8, v0
+; RV32-NEXT: vand.vv v0, v8, v24
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a0
-; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: ctpop_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV64-NEXT: lui a0, 61681
; RV64-NEXT: addiw a0, a0, -241
; RV64-NEXT: slli a1, a0, 32
; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a0
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsll.vi v24, v16, 2
+; RV64-NEXT: vxor.vx v24, v24, a0
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v0, v24, v0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v0, v8, 1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v0, v0, v16
+; RV64-NEXT: vsub.vv v8, v8, v0
+; RV64-NEXT: vand.vv v0, v8, v24
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: vadd.vv v8, v0, v8
+; RV64-NEXT: vsrl.vi v24, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v24
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: ctpop_nxv8i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index 2310f85b1fba93..fc7f50e4048666 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -1221,37 +1221,27 @@ declare <vscale x 1 x i64> @llvm.vp.ctpop.nxv1i64(<vscale x 1 x i64>, <vscale x
define <vscale x 1 x i64> @vp_ctpop_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv1i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v9, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v9, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsll.vi v10, v9, 2, v0.t
+; RV32-NEXT: vxor.vv v10, v9, v10, v0.t
+; RV32-NEXT: vsll.vi v11, v10, 1, v0.t
+; RV32-NEXT: vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT: vand.vv v11, v12, v11, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v11, v0.t
+; RV32-NEXT: vand.vv v11, v8, v10, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v10, v0.t
+; RV32-NEXT: vadd.vv v8, v11, v8, v0.t
+; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vi v10, v9, 3, v0.t
+; RV32-NEXT: vand.vv v9, v9, v10, v0.t
; RV32-NEXT: vmul.vv v8, v8, v9, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -1259,34 +1249,30 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
;
; RV64-LABEL: vp_ctpop_nxv1i64:
; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a1
; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0, v0.t
+; RV64-NEXT: vsll.vi v10, v9, 2, v0.t
+; RV64-NEXT: vxor.vx v10, v10, a1, v0.t
+; RV64-NEXT: vsll.vi v11, v10, 1, v0.t
+; RV64-NEXT: vxor.vv v11, v10, v11, v0.t
+; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT: vand.vv v11, v12, v11, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v11, v0.t
+; RV64-NEXT: vand.vv v11, v8, v10, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v10, v0.t
+; RV64-NEXT: vadd.vv v8, v11, v8, v0.t
+; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v10, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v9, v9, 3, v0.t
+; RV64-NEXT: vand.vx v9, v9, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v9, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -1303,37 +1289,27 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
define <vscale x 1 x i64> @vp_ctpop_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv1i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v9, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsll.vi v10, v9, 2
+; RV32-NEXT: vxor.vv v10, v9, v10
+; RV32-NEXT: vadd.vv v11, v10, v10
+; RV32-NEXT: vxor.vv v11, v10, v11
+; RV32-NEXT: vsrl.vi v12, v8, 1
+; RV32-NEXT: vand.vv v11, v12, v11
+; RV32-NEXT: vsub.vv v8, v8, v11
+; RV32-NEXT: vand.vv v11, v8, v10
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: vadd.vv v8, v11, v8
+; RV32-NEXT: vsrl.vi v10, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vi v10, v9, 3
+; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -1341,34 +1317,30 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
;
; RV64-LABEL: vp_ctpop_nxv1i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a1
; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0
+; RV64-NEXT: vsll.vi v10, v9, 2
+; RV64-NEXT: vxor.vx v10, v10, a1
+; RV64-NEXT: vadd.vv v11, v10, v10
+; RV64-NEXT: vxor.vv v11, v10, v11
+; RV64-NEXT: vsrl.vi v12, v8, 1
+; RV64-NEXT: vand.vv v11, v12, v11
+; RV64-NEXT: vsub.vv v8, v8, v11
+; RV64-NEXT: vand.vv v11, v8, v10
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v10
+; RV64-NEXT: vadd.vv v8, v11, v8
+; RV64-NEXT: vsrl.vi v10, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v10
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v9, v9, 3
+; RV64-NEXT: vand.vx v9, v9, a1
+; RV64-NEXT: vmul.vv v8, v8, v9
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -1389,37 +1361,27 @@ declare <vscale x 2 x i64> @llvm.vp.ctpop.nxv2i64(<vscale x 2 x i64>, <vscale x
define <vscale x 2 x i64> @vp_ctpop_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv2i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsll.vi v12, v10, 2, v0.t
+; RV32-NEXT: vxor.vv v12, v10, v12, v0.t
+; RV32-NEXT: vsll.vi v14, v12, 1, v0.t
+; RV32-NEXT: vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: vand.vv v14, v16, v14, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v14, v0.t
+; RV32-NEXT: vand.vv v14, v8, v12, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: vadd.vv v8, v14, v8, v0.t
+; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v12, v10, 3, v0.t
+; RV32-NEXT: vand.vv v10, v10, v12, v0.t
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -1427,34 +1389,30 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
;
; RV64-LABEL: vp_ctpop_nxv2i64:
; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0, v0.t
+; RV64-NEXT: vsll.vi v12, v10, 2, v0.t
+; RV64-NEXT: vxor.vx v12, v12, a1, v0.t
+; RV64-NEXT: vsll.vi v14, v12, 1, v0.t
+; RV64-NEXT: vxor.vv v14, v12, v14, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT: vand.vv v14, v16, v14, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v14, v0.t
+; RV64-NEXT: vand.vv v14, v8, v12, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v12, v0.t
+; RV64-NEXT: vadd.vv v8, v14, v8, v0.t
+; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v12, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v10, v10, 3, v0.t
+; RV64-NEXT: vand.vx v10, v10, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v10, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -1471,37 +1429,27 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
define <vscale x 2 x i64> @vp_ctpop_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv2i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsll.vi v12, v10, 2
+; RV32-NEXT: vxor.vv v12, v10, v12
+; RV32-NEXT: vadd.vv v14, v12, v12
+; RV32-NEXT: vxor.vv v14, v12, v14
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v14, v16, v14
+; RV32-NEXT: vsub.vv v8, v8, v14
+; RV32-NEXT: vand.vv v14, v8, v12
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: vadd.vv v8, v14, v8
+; RV32-NEXT: vsrl.vi v12, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v12
; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v12, v10, 3
+; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -1509,34 +1457,30 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
;
; RV64-LABEL: vp_ctpop_nxv2i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0
+; RV64-NEXT: vsll.vi v12, v10, 2
+; RV64-NEXT: vxor.vx v12, v12, a1
+; RV64-NEXT: vadd.vv v14, v12, v12
+; RV64-NEXT: vxor.vv v14, v12, v14
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vand.vv v14, v16, v14
+; RV64-NEXT: vsub.vv v8, v8, v14
+; RV64-NEXT: vand.vv v14, v8, v12
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v12
+; RV64-NEXT: vadd.vv v8, v14, v8
+; RV64-NEXT: vsrl.vi v12, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v12
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v10, v10, 3
+; RV64-NEXT: vand.vx v10, v10, a1
+; RV64-NEXT: vmul.vv v8, v8, v10
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -1557,37 +1501,27 @@ declare <vscale x 4 x i64> @llvm.vp.ctpop.nxv4i64(<vscale x 4 x i64>, <vscale x
define <vscale x 4 x i64> @vp_ctpop_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv4i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsll.vi v16, v12, 2, v0.t
+; RV32-NEXT: vxor.vv v16, v12, v16, v0.t
+; RV32-NEXT: vsll.vi v20, v16, 1, v0.t
+; RV32-NEXT: vxor.vv v20, v16, v20, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: vand.vv v20, v24, v20, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v20, v0.t
+; RV32-NEXT: vand.vv v20, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vadd.vv v8, v20, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vi v16, v12, 3, v0.t
+; RV32-NEXT: vand.vv v12, v12, v16, v0.t
; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -1595,34 +1529,30 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
;
; RV64-LABEL: vp_ctpop_nxv4i64:
; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0, v0.t
+; RV64-NEXT: vsll.vi v16, v12, 2, v0.t
+; RV64-NEXT: vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT: vsll.vi v20, v16, 1, v0.t
+; RV64-NEXT: vxor.vv v20, v16, v20, v0.t
+; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT: vand.vv v20, v24, v20, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v20, v0.t
+; RV64-NEXT: vand.vv v20, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: vadd.vv v8, v20, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v12, v12, 3, v0.t
+; RV64-NEXT: vand.vx v12, v12, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v12, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -1639,37 +1569,27 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
define <vscale x 4 x i64> @vp_ctpop_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv4i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsll.vi v16, v12, 2
+; RV32-NEXT: vxor.vv v16, v12, v16
+; RV32-NEXT: vadd.vv v20, v16, v16
+; RV32-NEXT: vxor.vv v20, v16, v20
+; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vand.vv v20, v24, v20
+; RV32-NEXT: vsub.vv v8, v8, v20
+; RV32-NEXT: vand.vv v20, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vadd.vv v8, v20, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vi v16, v12, 3
+; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -1677,34 +1597,30 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
;
; RV64-LABEL: vp_ctpop_nxv4i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0
+; RV64-NEXT: vsll.vi v16, v12, 2
+; RV64-NEXT: vxor.vx v16, v16, a1
+; RV64-NEXT: vadd.vv v20, v16, v16
+; RV64-NEXT: vxor.vv v20, v16, v20
+; RV64-NEXT: vsrl.vi v24, v8, 1
+; RV64-NEXT: vand.vv v20, v24, v20
+; RV64-NEXT: vsub.vv v8, v8, v20
+; RV64-NEXT: vand.vv v20, v8, v16
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vadd.vv v8, v20, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v12, v12, 3
+; RV64-NEXT: vand.vx v12, v12, a1
+; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -1725,74 +1641,163 @@ declare <vscale x 7 x i64> @llvm.vp.ctpop.nxv7i64(<vscale x 7 x i64>, <vscale x
define <vscale x 7 x i64> @vp_ctpop_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv7i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT: vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT: vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv7i64:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vsll.vi v24, v16, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v24, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 1, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v24, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v24, v24, v16, v0.t
+; RV64-NEXT: vand.vv v16, v24, v8, v0.t
+; RV64-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV64-NEXT: vand.vv v8, v24, v8, v0.t
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_ctpop_nxv7i64:
@@ -1807,74 +1812,98 @@ define <vscale x 7 x i64> @vp_ctpop_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
define <vscale x 7 x i64> @vp_ctpop_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv7i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsll.vi v24, v16, 2
+; RV32-NEXT: vxor.vv v24, v16, v24
+; RV32-NEXT: vadd.vv v0, v24, v24
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v16
+; RV32-NEXT: vsub.vv v8, v8, v0
+; RV32-NEXT: vand.vv v0, v8, v24
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv7i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v24, v16, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v0, v24, v0
; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vand.vv v16, v16, v0
; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
+; RV64-NEXT: vand.vv v16, v8, v24
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v24
; RV64-NEXT: vadd.vv v8, v16, v8
; RV64-NEXT: vsrl.vi v16, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_ctpop_nxv7i64_unmasked:
@@ -1893,74 +1922,163 @@ declare <vscale x 8 x i64> @llvm.vp.ctpop.nxv8i64(<vscale x 8 x i64>, <vscale x
define <vscale x 8 x i64> @vp_ctpop_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT: vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT: vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv8i64:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vsll.vi v24, v16, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v24, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 1, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v24, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v24, v24, v16, v0.t
+; RV64-NEXT: vand.vv v16, v24, v8, v0.t
+; RV64-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV64-NEXT: vand.vv v8, v24, v8, v0.t
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i64:
@@ -1975,74 +2093,98 @@ define <vscale x 8 x i64> @vp_ctpop_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
define <vscale x 8 x i64> @vp_ctpop_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_nxv8i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsll.vi v24, v16, 2
+; RV32-NEXT: vxor.vv v24, v16, v24
+; RV32-NEXT: vadd.vv v0, v24, v24
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v16
+; RV32-NEXT: vsub.vv v8, v8, v0
+; RV32-NEXT: vand.vv v0, v8, v24
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv8i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v24, v16, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v0, v24, v0
; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vand.vv v16, v16, v0
; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
+; RV64-NEXT: vand.vv v16, v8, v24
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v24
; RV64-NEXT: vadd.vv v8, v16, v8
; RV64-NEXT: vsrl.vi v16, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i64_unmasked:
@@ -2064,20 +2206,19 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 56
+; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV32-NEXT: vmv1r.v v24, v0
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 40
+; RV32-NEXT: li a2, 24
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 48
-; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -2089,101 +2230,121 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a2, a3, a2
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: lui a3, 349525
-; RV32-NEXT: addi a3, a3, 1365
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a3
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: lui a3, 209715
-; RV32-NEXT: addi a3, a3, 819
-; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a3
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT: lui a3, 61681
-; RV32-NEXT: addi a3, a3, -241
-; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a3
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: lui a3, 4112
-; RV32-NEXT: addi a3, a3, 257
-; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a3
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 4
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v16, 3, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -2194,98 +2355,112 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
+; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: vsub.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v16, 3, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 56
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
@@ -2296,82 +2471,242 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: li a2, 48
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; RV64-NEXT: vmv1r.v v24, v0
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: srli a2, a1, 3
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: srli a1, a2, 3
; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
-; RV64-NEXT: vslidedown.vx v24, v0, a2
-; RV64-NEXT: mv a2, a0
-; RV64-NEXT: bltu a0, a1, .LBB46_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: mv a2, a1
-; RV64-NEXT: .LBB46_2:
-; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a2, 349525
-; RV64-NEXT: addiw a2, a2, 1365
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: vand.vx v16, v16, a2, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a3, 209715
-; RV64-NEXT: addiw a3, a3, 819
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v16, v8, a3, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
+; RV64-NEXT: vslidedown.vx v0, v0, a1
+; RV64-NEXT: sub a1, a0, a2
+; RV64-NEXT: sltu a3, a0, a1
+; RV64-NEXT: addi a3, a3, -1
+; RV64-NEXT: and a3, a3, a1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a4, a1, 32
+; RV64-NEXT: add a1, a1, a4
+; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: li a5, 40
+; RV64-NEXT: mul a4, a4, a5
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT: addi a3, sp, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 24
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 3
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 3
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 24
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v16, v16, v8, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 24
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 24
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: addi a3, sp, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v16, v8, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 24
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a4, 61681
-; RV64-NEXT: addiw a4, a4, -241
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vand.vx v8, v8, a4, v0.t
-; RV64-NEXT: lui a5, 4112
-; RV64-NEXT: addiw a5, a5, 257
-; RV64-NEXT: slli a6, a5, 32
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT: li a6, 56
-; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT: addi a7, sp, 16
-; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 40
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT: li a3, 56
+; RV64-NEXT: vsrl.vx v8, v8, a3, v0.t
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: li a5, 24
+; RV64-NEXT: mul a4, a4, a5
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: bltu a0, a2, .LBB46_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a0, a2
+; RV64-NEXT: .LBB46_2:
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vmv1r.v v0, v24
; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 40
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: vand.vx v16, v16, a2, v0.t
-; RV64-NEXT: vsub.vv v16, v8, v16, v0.t
-; RV64-NEXT: vand.vx v8, v16, a3, v0.t
-; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT: vand.vx v16, v16, a3, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: vand.vx v8, v8, a4, v0.t
-; RV64-NEXT: vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT: vsrl.vx v16, v8, a6, v0.t
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v16, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT: addi a0, sp, 16
; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 40
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT: vsrl.vx v8, v8, a3, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 48
+; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
@@ -2407,162 +2742,215 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a2, a3, a2
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v16, 1
-; RV32-NEXT: lui a3, 349525
-; RV32-NEXT: addi a3, a3, 1365
-; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v0, a3
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vsub.vv v24, v16, v24
-; RV32-NEXT: lui a3, 209715
-; RV32-NEXT: addi a3, a3, 819
-; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v0, a3
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v24, v0
-; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vadd.vv v24, v16, v24
-; RV32-NEXT: vsrl.vi v16, v24, 4
-; RV32-NEXT: vadd.vv v16, v24, v16
; RV32-NEXT: lui a3, 61681
; RV32-NEXT: addi a3, a3, -241
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsll.vi v0, v24, 2
+; RV32-NEXT: vxor.vv v8, v24, v0
+; RV32-NEXT: vadd.vv v0, v8, v8
+; RV32-NEXT: vxor.vv v16, v8, v0
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v0, v0, 1
+; RV32-NEXT: vand.vv v16, v0, v16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v0, v16
+; RV32-NEXT: vand.vv v0, v16, v8
+; RV32-NEXT: vsrl.vi v16, v16, 2
+; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
+; RV32-NEXT: vmul.vv v8, v8, v16
+; RV32-NEXT: li a2, 56
+; RV32-NEXT: vsrl.vx v8, v8, a2
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: lui a3, 4112
-; RV32-NEXT: addi a3, a3, 257
-; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a3
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vmul.vv v16, v16, v24
-; RV32-NEXT: li a2, 56
-; RV32-NEXT: vsrl.vx v16, v16, a2
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB47_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: mv a0, a1
; RV32-NEXT: .LBB47_2:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vsll.vi v8, v24, 2
+; RV32-NEXT: vxor.vv v8, v24, v8
+; RV32-NEXT: vadd.vv v0, v8, v8
+; RV32-NEXT: vxor.vv v16, v8, v0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vsub.vv v24, v8, v24
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v0, v16, 1
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v0, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v24, v0
-; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsub.vv v16, v0, v16
+; RV32-NEXT: vand.vv v0, v16, v8
+; RV32-NEXT: vsrl.vi v16, v16, 2
+; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
+; RV32-NEXT: vmul.vv v8, v8, v16
+; RV32-NEXT: vsrl.vx v8, v8, a2
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v8, v24
-; RV32-NEXT: vsrl.vx v8, v8, a2
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_nxv16i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: mv a2, a0
-; RV64-NEXT: bltu a0, a1, .LBB47_2
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: sub a1, a0, a2
+; RV64-NEXT: sltu a3, a0, a1
+; RV64-NEXT: addi a3, a3, -1
+; RV64-NEXT: and a3, a3, a1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a4, a1, 32
+; RV64-NEXT: add a1, a1, a4
+; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v0, v8, 2
+; RV64-NEXT: vxor.vx v0, v0, a1
+; RV64-NEXT: vadd.vv v24, v0, v0
+; RV64-NEXT: vxor.vv v16, v0, v24
+; RV64-NEXT: addi a3, sp, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v24, v8, 1
+; RV64-NEXT: vand.vv v16, v24, v16
+; RV64-NEXT: vsub.vv v16, v8, v16
+; RV64-NEXT: vand.vv v24, v16, v0
+; RV64-NEXT: vsrl.vi v16, v16, 2
+; RV64-NEXT: vand.vv v16, v16, v0
+; RV64-NEXT: vadd.vv v16, v24, v16
+; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vadd.vv v16, v16, v24
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vmv8r.v v8, v24
+; RV64-NEXT: vsrl.vi v24, v24, 3
+; RV64-NEXT: vand.vx v24, v24, a1
+; RV64-NEXT: vmul.vv v16, v16, v24
+; RV64-NEXT: li a3, 56
+; RV64-NEXT: vsrl.vx v16, v16, a3
+; RV64-NEXT: addi a4, sp, 16
+; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: bltu a0, a2, .LBB47_2
; RV64-NEXT: # %bb.1:
-; RV64-NEXT: mv a2, a1
+; RV64-NEXT: mv a0, a2
; RV64-NEXT: .LBB47_2:
-; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v8, 1
-; RV64-NEXT: lui a2, 349525
-; RV64-NEXT: addiw a2, a2, 1365
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: vand.vx v24, v24, a2
-; RV64-NEXT: vsub.vv v8, v8, v24
-; RV64-NEXT: lui a3, 209715
-; RV64-NEXT: addiw a3, a3, 819
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v24, v8, a3
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: vadd.vv v8, v24, v8
-; RV64-NEXT: vsrl.vi v24, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v24
-; RV64-NEXT: lui a4, 61681
-; RV64-NEXT: addiw a4, a4, -241
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vand.vx v8, v8, a4
-; RV64-NEXT: lui a5, 4112
-; RV64-NEXT: addiw a5, a5, 257
-; RV64-NEXT: slli a6, a5, 32
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: vmul.vx v8, v8, a5
-; RV64-NEXT: li a6, 56
-; RV64-NEXT: vsrl.vx v8, v8, a6
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a1
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v16, 1
-; RV64-NEXT: vand.vx v24, v24, a2
-; RV64-NEXT: vsub.vv v16, v16, v24
-; RV64-NEXT: vand.vx v24, v16, a3
+; RV64-NEXT: vsll.vi v24, v8, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v16, v24, v0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v0, v8, 1
+; RV64-NEXT: vand.vv v16, v0, v16
+; RV64-NEXT: vsub.vv v16, v8, v16
+; RV64-NEXT: vand.vv v0, v16, v24
; RV64-NEXT: vsrl.vi v16, v16, 2
-; RV64-NEXT: vand.vx v16, v16, a3
-; RV64-NEXT: vadd.vv v16, v24, v16
+; RV64-NEXT: vand.vv v16, v16, v24
+; RV64-NEXT: vadd.vv v16, v0, v16
; RV64-NEXT: vsrl.vi v24, v16, 4
; RV64-NEXT: vadd.vv v16, v16, v24
-; RV64-NEXT: vand.vx v16, v16, a4
-; RV64-NEXT: vmul.vx v16, v16, a5
-; RV64-NEXT: vsrl.vx v16, v16, a6
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v8, v8, 3
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vmul.vv v8, v16, v8
+; RV64-NEXT: vsrl.vx v8, v8, a3
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64_unmasked:
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
index b14cde25aa85b2..6ee63fc9a12e8a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
@@ -1135,41 +1135,31 @@ declare <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32>, i1)
define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
; RV32I-LABEL: cttz_nxv1i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a0, 1
-; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; RV32I-NEXT: vsub.vx v9, v8, a0
-; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vsrl.vi v9, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v9, v9, v10
-; RV32I-NEXT: vsub.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v10, v8, v9
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vadd.vv v8, v10, v8
-; RV32I-NEXT: vsrl.vi v9, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v9
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; RV32I-NEXT: vmv.v.x v9, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT: vsll.vi v10, v9, 2
+; RV32I-NEXT: vxor.vv v10, v9, v10
+; RV32I-NEXT: vadd.vv v11, v10, v10
+; RV32I-NEXT: vxor.vv v11, v10, v11
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsub.vx v12, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: vand.vv v11, v12, v11
+; RV32I-NEXT: vsub.vv v8, v8, v11
+; RV32I-NEXT: vand.vv v11, v8, v10
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vadd.vv v8, v11, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT: vsrl.vi v10, v9, 3
+; RV32I-NEXT: vand.vv v9, v9, v10
; RV32I-NEXT: vmul.vv v8, v8, v9
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
@@ -1177,38 +1167,33 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
;
; RV64I-LABEL: cttz_nxv1i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a0, 1
-; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; RV64I-NEXT: vsub.vx v9, v8, a0
-; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vand.vv v8, v8, v9
-; RV64I-NEXT: vsrl.vi v9, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v9, v9, a0
-; RV64I-NEXT: vsub.vv v8, v8, v9
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v9, v8, a0
-; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: vadd.vv v8, v9, v8
-; RV64I-NEXT: vsrl.vi v9, v8, 4
-; RV64I-NEXT: vadd.vv v8, v8, v9
; RV64I-NEXT: lui a0, 61681
; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV64I-NEXT: vmv.v.x v9, a0
+; RV64I-NEXT: vsll.vi v10, v9, 2
+; RV64I-NEXT: vxor.vx v10, v10, a0
+; RV64I-NEXT: vadd.vv v11, v10, v10
+; RV64I-NEXT: vxor.vv v11, v10, v11
+; RV64I-NEXT: li a1, 1
+; RV64I-NEXT: vsub.vx v12, v8, a1
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vand.vv v11, v12, v11
+; RV64I-NEXT: vsub.vv v8, v8, v11
+; RV64I-NEXT: vand.vv v11, v8, v10
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vv v8, v8, v10
+; RV64I-NEXT: vadd.vv v8, v11, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v9, v9, 3
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vmul.vv v8, v8, v9
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
; RV64I-NEXT: ret
@@ -1298,41 +1283,31 @@ declare <vscale x 1 x i64> @llvm.cttz.nxv1i64(<vscale x 1 x i64>, i1)
define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
; RV32I-LABEL: cttz_nxv2i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a0, 1
-; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma
-; RV32I-NEXT: vsub.vx v10, v8, a0
-; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vsrl.vi v10, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v10, v10, v12
-; RV32I-NEXT: vsub.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v12, v8, v10
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vadd.vv v8, v12, v8
-; RV32I-NEXT: vsrl.vi v10, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v10
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; RV32I-NEXT: vmv.v.x v10, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT: vsll.vi v12, v10, 2
+; RV32I-NEXT: vxor.vv v12, v10, v12
+; RV32I-NEXT: vadd.vv v14, v12, v12
+; RV32I-NEXT: vxor.vv v14, v12, v14
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsub.vx v16, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 1
+; RV32I-NEXT: vand.vv v14, v16, v14
+; RV32I-NEXT: vsub.vv v8, v8, v14
+; RV32I-NEXT: vand.vv v14, v8, v12
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vadd.vv v8, v14, v8
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v12
; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT: vsrl.vi v12, v10, 3
+; RV32I-NEXT: vand.vv v10, v10, v12
; RV32I-NEXT: vmul.vv v8, v8, v10
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
@@ -1340,38 +1315,33 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
;
; RV64I-LABEL: cttz_nxv2i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a0, 1
-; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma
-; RV64I-NEXT: vsub.vx v10, v8, a0
-; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vand.vv v8, v8, v10
-; RV64I-NEXT: vsrl.vi v10, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v10, v10, a0
-; RV64I-NEXT: vsub.vv v8, v8, v10
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v10, v8, a0
-; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: vadd.vv v8, v10, v8
-; RV64I-NEXT: vsrl.vi v10, v8, 4
-; RV64I-NEXT: vadd.vv v8, v8, v10
; RV64I-NEXT: lui a0, 61681
; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV64I-NEXT: vmv.v.x v10, a0
+; RV64I-NEXT: vsll.vi v12, v10, 2
+; RV64I-NEXT: vxor.vx v12, v12, a0
+; RV64I-NEXT: vadd.vv v14, v12, v12
+; RV64I-NEXT: vxor.vv v14, v12, v14
+; RV64I-NEXT: li a1, 1
+; RV64I-NEXT: vsub.vx v16, v8, a1
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: vand.vv v14, v16, v14
+; RV64I-NEXT: vsub.vv v8, v8, v14
+; RV64I-NEXT: vand.vv v14, v8, v12
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: vadd.vv v8, v14, v8
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v10, v10, 3
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vmul.vv v8, v8, v10
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
; RV64I-NEXT: ret
@@ -1461,41 +1431,31 @@ declare <vscale x 2 x i64> @llvm.cttz.nxv2i64(<vscale x 2 x i64>, i1)
define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
; RV32I-LABEL: cttz_nxv4i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a0, 1
-; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; RV32I-NEXT: vsub.vx v12, v8, a0
-; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: vsrl.vi v12, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v12, v12, v16
-; RV32I-NEXT: vsub.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v16, v8, v12
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: vadd.vv v8, v16, v8
-; RV32I-NEXT: vsrl.vi v12, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v12
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; RV32I-NEXT: vmv.v.x v12, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT: vsll.vi v16, v12, 2
+; RV32I-NEXT: vxor.vv v16, v12, v16
+; RV32I-NEXT: vadd.vv v20, v16, v16
+; RV32I-NEXT: vxor.vv v20, v16, v20
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsub.vx v24, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v24
+; RV32I-NEXT: vsrl.vi v24, v8, 1
+; RV32I-NEXT: vand.vv v20, v24, v20
+; RV32I-NEXT: vsub.vv v8, v8, v20
+; RV32I-NEXT: vand.vv v20, v8, v16
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vadd.vv v8, v20, v8
+; RV32I-NEXT: vsrl.vi v16, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v16
; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT: vsrl.vi v16, v12, 3
+; RV32I-NEXT: vand.vv v12, v12, v16
; RV32I-NEXT: vmul.vv v8, v8, v12
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
@@ -1503,38 +1463,33 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
;
; RV64I-LABEL: cttz_nxv4i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a0, 1
-; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; RV64I-NEXT: vsub.vx v12, v8, a0
-; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vand.vv v8, v8, v12
-; RV64I-NEXT: vsrl.vi v12, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v12, v12, a0
-; RV64I-NEXT: vsub.vv v8, v8, v12
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v12, v8, a0
-; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: vadd.vv v8, v12, v8
-; RV64I-NEXT: vsrl.vi v12, v8, 4
-; RV64I-NEXT: vadd.vv v8, v8, v12
; RV64I-NEXT: lui a0, 61681
; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV64I-NEXT: vmv.v.x v12, a0
+; RV64I-NEXT: vsll.vi v16, v12, 2
+; RV64I-NEXT: vxor.vx v16, v16, a0
+; RV64I-NEXT: vadd.vv v20, v16, v16
+; RV64I-NEXT: vxor.vv v20, v16, v20
+; RV64I-NEXT: li a1, 1
+; RV64I-NEXT: vsub.vx v24, v8, a1
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v24
+; RV64I-NEXT: vsrl.vi v24, v8, 1
+; RV64I-NEXT: vand.vv v20, v24, v20
+; RV64I-NEXT: vsub.vv v8, v8, v20
+; RV64I-NEXT: vand.vv v20, v8, v16
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vv v8, v8, v16
+; RV64I-NEXT: vadd.vv v8, v20, v8
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v16
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v12, v12, 3
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vmul.vv v8, v8, v12
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
; RV64I-NEXT: ret
@@ -1624,82 +1579,105 @@ declare <vscale x 4 x i64> @llvm.cttz.nxv4i64(<vscale x 4 x i64>, i1)
define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
; RV32I-LABEL: cttz_nxv8i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a0, 1
-; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV32I-NEXT: vsub.vx v16, v8, a0
-; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vand.vv v8, v8, v16
-; RV32I-NEXT: vsrl.vi v16, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v24, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vand.vv v16, v16, v24
-; RV32I-NEXT: vsub.vv v8, v8, v16
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vand.vv v24, v8, v16
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v16
-; RV32I-NEXT: vadd.vv v8, v24, v8
-; RV32I-NEXT: vsrl.vi v16, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v16
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: sub sp, sp, a0
+; RV32I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; RV32I-NEXT: vmv.v.x v16, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT: vsll.vi v24, v16, 2
+; RV32I-NEXT: vxor.vv v24, v16, v24
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsub.vx v0, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v0
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32I-NEXT: vadd.vv v0, v24, v24
+; RV32I-NEXT: vxor.vv v0, v24, v0
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 3
+; RV32I-NEXT: add a0, sp, a0
+; RV32I-NEXT: addi a0, a0, 16
+; RV32I-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32I-NEXT: vsrl.vi v0, v8, 1
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 3
+; RV32I-NEXT: add a0, sp, a0
+; RV32I-NEXT: addi a0, a0, 16
+; RV32I-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT: vand.vv v0, v0, v8
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT: vsub.vv v8, v8, v0
+; RV32I-NEXT: vand.vv v0, v8, v24
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v24
+; RV32I-NEXT: vadd.vv v8, v0, v8
+; RV32I-NEXT: vsrl.vi v24, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v24
; RV32I-NEXT: vand.vv v8, v8, v16
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT: vsrl.vi v24, v16, 3
+; RV32I-NEXT: vand.vv v16, v16, v24
; RV32I-NEXT: vmul.vv v8, v8, v16
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: add sp, sp, a0
+; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
; RV64I-LABEL: cttz_nxv8i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a0, 1
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: .cfi_def_cfa_offset 16
+; RV64I-NEXT: csrr a0, vlenb
+; RV64I-NEXT: slli a0, a0, 3
+; RV64I-NEXT: sub sp, sp, a0
+; RV64I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: slli a1, a0, 32
+; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64I-NEXT: vsub.vx v16, v8, a0
+; RV64I-NEXT: vmv.v.x v16, a0
+; RV64I-NEXT: addi a1, sp, 16
+; RV64I-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64I-NEXT: vsll.vi v24, v16, 2
+; RV64I-NEXT: vxor.vx v24, v24, a0
+; RV64I-NEXT: li a1, 1
+; RV64I-NEXT: vsub.vx v0, v8, a1
; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vand.vv v8, v8, v16
+; RV64I-NEXT: vand.vv v8, v8, v0
+; RV64I-NEXT: vadd.vv v0, v24, v24
+; RV64I-NEXT: vxor.vv v0, v24, v0
; RV64I-NEXT: vsrl.vi v16, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: vand.vv v16, v16, v0
; RV64I-NEXT: vsub.vv v8, v8, v16
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v16, v8, a0
+; RV64I-NEXT: vand.vv v16, v8, v24
; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vand.vv v8, v8, v24
; RV64I-NEXT: vadd.vv v8, v16, v8
; RV64I-NEXT: vsrl.vi v16, v8, 4
; RV64I-NEXT: vadd.vv v8, v8, v16
-; RV64I-NEXT: lui a0, 61681
-; RV64I-NEXT: addiw a0, a0, -241
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: addi a1, sp, 16
+; RV64I-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64I-NEXT: vsrl.vi v16, v16, 3
+; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: vmul.vv v8, v8, v16
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: csrr a0, vlenb
+; RV64I-NEXT: slli a0, a0, 3
+; RV64I-NEXT: add sp, sp, a0
+; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
;
; RV32F-LABEL: cttz_nxv8i64:
@@ -2813,41 +2791,31 @@ define <vscale x 16 x i32> @cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
; RV32I-LABEL: cttz_zero_undef_nxv1i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a0, 1
-; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; RV32I-NEXT: vsub.vx v9, v8, a0
-; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vsrl.vi v9, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v9, v9, v10
-; RV32I-NEXT: vsub.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v10, v8, v9
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vadd.vv v8, v10, v8
-; RV32I-NEXT: vsrl.vi v9, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v9
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; RV32I-NEXT: vmv.v.x v9, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT: vsll.vi v10, v9, 2
+; RV32I-NEXT: vxor.vv v10, v9, v10
+; RV32I-NEXT: vadd.vv v11, v10, v10
+; RV32I-NEXT: vxor.vv v11, v10, v11
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsub.vx v12, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: vand.vv v11, v12, v11
+; RV32I-NEXT: vsub.vv v8, v8, v11
+; RV32I-NEXT: vand.vv v11, v8, v10
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vadd.vv v8, v11, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT: vsrl.vi v10, v9, 3
+; RV32I-NEXT: vand.vv v9, v9, v10
; RV32I-NEXT: vmul.vv v8, v8, v9
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
@@ -2855,38 +2823,33 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
;
; RV64I-LABEL: cttz_zero_undef_nxv1i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a0, 1
-; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; RV64I-NEXT: vsub.vx v9, v8, a0
-; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vand.vv v8, v8, v9
-; RV64I-NEXT: vsrl.vi v9, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v9, v9, a0
-; RV64I-NEXT: vsub.vv v8, v8, v9
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v9, v8, a0
-; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: vadd.vv v8, v9, v8
-; RV64I-NEXT: vsrl.vi v9, v8, 4
-; RV64I-NEXT: vadd.vv v8, v8, v9
; RV64I-NEXT: lui a0, 61681
; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV64I-NEXT: vmv.v.x v9, a0
+; RV64I-NEXT: vsll.vi v10, v9, 2
+; RV64I-NEXT: vxor.vx v10, v10, a0
+; RV64I-NEXT: vadd.vv v11, v10, v10
+; RV64I-NEXT: vxor.vv v11, v10, v11
+; RV64I-NEXT: li a1, 1
+; RV64I-NEXT: vsub.vx v12, v8, a1
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vand.vv v11, v12, v11
+; RV64I-NEXT: vsub.vv v8, v8, v11
+; RV64I-NEXT: vand.vv v11, v8, v10
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vv v8, v8, v10
+; RV64I-NEXT: vadd.vv v8, v11, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v9, v9, 3
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vmul.vv v8, v8, v9
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
; RV64I-NEXT: ret
@@ -2933,41 +2896,31 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
; RV32I-LABEL: cttz_zero_undef_nxv2i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a0, 1
-; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma
-; RV32I-NEXT: vsub.vx v10, v8, a0
-; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vsrl.vi v10, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v10, v10, v12
-; RV32I-NEXT: vsub.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v12, v8, v10
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vadd.vv v8, v12, v8
-; RV32I-NEXT: vsrl.vi v10, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v10
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; RV32I-NEXT: vmv.v.x v10, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT: vsll.vi v12, v10, 2
+; RV32I-NEXT: vxor.vv v12, v10, v12
+; RV32I-NEXT: vadd.vv v14, v12, v12
+; RV32I-NEXT: vxor.vv v14, v12, v14
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsub.vx v16, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 1
+; RV32I-NEXT: vand.vv v14, v16, v14
+; RV32I-NEXT: vsub.vv v8, v8, v14
+; RV32I-NEXT: vand.vv v14, v8, v12
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vadd.vv v8, v14, v8
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v12
; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT: vsrl.vi v12, v10, 3
+; RV32I-NEXT: vand.vv v10, v10, v12
; RV32I-NEXT: vmul.vv v8, v8, v10
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
@@ -2975,38 +2928,33 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
;
; RV64I-LABEL: cttz_zero_undef_nxv2i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a0, 1
-; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma
-; RV64I-NEXT: vsub.vx v10, v8, a0
-; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vand.vv v8, v8, v10
-; RV64I-NEXT: vsrl.vi v10, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v10, v10, a0
-; RV64I-NEXT: vsub.vv v8, v8, v10
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v10, v8, a0
-; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: vadd.vv v8, v10, v8
-; RV64I-NEXT: vsrl.vi v10, v8, 4
-; RV64I-NEXT: vadd.vv v8, v8, v10
; RV64I-NEXT: lui a0, 61681
; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV64I-NEXT: vmv.v.x v10, a0
+; RV64I-NEXT: vsll.vi v12, v10, 2
+; RV64I-NEXT: vxor.vx v12, v12, a0
+; RV64I-NEXT: vadd.vv v14, v12, v12
+; RV64I-NEXT: vxor.vv v14, v12, v14
+; RV64I-NEXT: li a1, 1
+; RV64I-NEXT: vsub.vx v16, v8, a1
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: vand.vv v14, v16, v14
+; RV64I-NEXT: vsub.vv v8, v8, v14
+; RV64I-NEXT: vand.vv v14, v8, v12
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: vadd.vv v8, v14, v8
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v10, v10, 3
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vmul.vv v8, v8, v10
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
; RV64I-NEXT: ret
@@ -3053,41 +3001,31 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
; RV32I-LABEL: cttz_zero_undef_nxv4i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a0, 1
-; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; RV32I-NEXT: vsub.vx v12, v8, a0
-; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: vsrl.vi v12, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v12, v12, v16
-; RV32I-NEXT: vsub.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v16, v8, v12
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: vadd.vv v8, v16, v8
-; RV32I-NEXT: vsrl.vi v12, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v12
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; RV32I-NEXT: vmv.v.x v12, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT: vsll.vi v16, v12, 2
+; RV32I-NEXT: vxor.vv v16, v12, v16
+; RV32I-NEXT: vadd.vv v20, v16, v16
+; RV32I-NEXT: vxor.vv v20, v16, v20
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsub.vx v24, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v24
+; RV32I-NEXT: vsrl.vi v24, v8, 1
+; RV32I-NEXT: vand.vv v20, v24, v20
+; RV32I-NEXT: vsub.vv v8, v8, v20
+; RV32I-NEXT: vand.vv v20, v8, v16
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vadd.vv v8, v20, v8
+; RV32I-NEXT: vsrl.vi v16, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v16
; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT: vsrl.vi v16, v12, 3
+; RV32I-NEXT: vand.vv v12, v12, v16
; RV32I-NEXT: vmul.vv v8, v8, v12
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
@@ -3095,38 +3033,33 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
;
; RV64I-LABEL: cttz_zero_undef_nxv4i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a0, 1
-; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; RV64I-NEXT: vsub.vx v12, v8, a0
-; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vand.vv v8, v8, v12
-; RV64I-NEXT: vsrl.vi v12, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v12, v12, a0
-; RV64I-NEXT: vsub.vv v8, v8, v12
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v12, v8, a0
-; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: vadd.vv v8, v12, v8
-; RV64I-NEXT: vsrl.vi v12, v8, 4
-; RV64I-NEXT: vadd.vv v8, v8, v12
; RV64I-NEXT: lui a0, 61681
; RV64I-NEXT: addiw a0, a0, -241
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV64I-NEXT: vmv.v.x v12, a0
+; RV64I-NEXT: vsll.vi v16, v12, 2
+; RV64I-NEXT: vxor.vx v16, v16, a0
+; RV64I-NEXT: vadd.vv v20, v16, v16
+; RV64I-NEXT: vxor.vv v20, v16, v20
+; RV64I-NEXT: li a1, 1
+; RV64I-NEXT: vsub.vx v24, v8, a1
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v24
+; RV64I-NEXT: vsrl.vi v24, v8, 1
+; RV64I-NEXT: vand.vv v20, v24, v20
+; RV64I-NEXT: vsub.vv v8, v8, v20
+; RV64I-NEXT: vand.vv v20, v8, v16
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vv v8, v8, v16
+; RV64I-NEXT: vadd.vv v8, v20, v8
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v16
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v12, v12, 3
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vmul.vv v8, v8, v12
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
; RV64I-NEXT: ret
@@ -3173,82 +3106,105 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
; RV32I-LABEL: cttz_zero_undef_nxv8i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a0, 1
-; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV32I-NEXT: vsub.vx v16, v8, a0
-; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vand.vv v8, v8, v16
-; RV32I-NEXT: vsrl.vi v16, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v24, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vand.vv v16, v16, v24
-; RV32I-NEXT: vsub.vv v8, v8, v16
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vand.vv v24, v8, v16
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v16
-; RV32I-NEXT: vadd.vv v8, v24, v8
-; RV32I-NEXT: vsrl.vi v16, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v16
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: sub sp, sp, a0
+; RV32I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; RV32I-NEXT: vmv.v.x v16, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT: vsll.vi v24, v16, 2
+; RV32I-NEXT: vxor.vv v24, v16, v24
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsub.vx v0, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v0
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32I-NEXT: vadd.vv v0, v24, v24
+; RV32I-NEXT: vxor.vv v0, v24, v0
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 3
+; RV32I-NEXT: add a0, sp, a0
+; RV32I-NEXT: addi a0, a0, 16
+; RV32I-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32I-NEXT: vsrl.vi v0, v8, 1
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 3
+; RV32I-NEXT: add a0, sp, a0
+; RV32I-NEXT: addi a0, a0, 16
+; RV32I-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT: vand.vv v0, v0, v8
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT: vsub.vv v8, v8, v0
+; RV32I-NEXT: vand.vv v0, v8, v24
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v24
+; RV32I-NEXT: vadd.vv v8, v0, v8
+; RV32I-NEXT: vsrl.vi v24, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v24
; RV32I-NEXT: vand.vv v8, v8, v16
-; RV32I-NEXT: lui a0, 4112
-; RV32I-NEXT: addi a0, a0, 257
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT: vsrl.vi v24, v16, 3
+; RV32I-NEXT: vand.vv v16, v16, v24
; RV32I-NEXT: vmul.vv v8, v8, v16
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: add sp, sp, a0
+; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
; RV64I-LABEL: cttz_zero_undef_nxv8i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a0, 1
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: .cfi_def_cfa_offset 16
+; RV64I-NEXT: csrr a0, vlenb
+; RV64I-NEXT: slli a0, a0, 3
+; RV64I-NEXT: sub sp, sp, a0
+; RV64I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: slli a1, a0, 32
+; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64I-NEXT: vsub.vx v16, v8, a0
+; RV64I-NEXT: vmv.v.x v16, a0
+; RV64I-NEXT: addi a1, sp, 16
+; RV64I-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64I-NEXT: vsll.vi v24, v16, 2
+; RV64I-NEXT: vxor.vx v24, v24, a0
+; RV64I-NEXT: li a1, 1
+; RV64I-NEXT: vsub.vx v0, v8, a1
; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vand.vv v8, v8, v16
+; RV64I-NEXT: vand.vv v8, v8, v0
+; RV64I-NEXT: vadd.vv v0, v24, v24
+; RV64I-NEXT: vxor.vv v0, v24, v0
; RV64I-NEXT: vsrl.vi v16, v8, 1
-; RV64I-NEXT: lui a0, 349525
-; RV64I-NEXT: addiw a0, a0, 1365
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: vand.vv v16, v16, v0
; RV64I-NEXT: vsub.vv v8, v8, v16
-; RV64I-NEXT: lui a0, 209715
-; RV64I-NEXT: addiw a0, a0, 819
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vand.vx v16, v8, a0
+; RV64I-NEXT: vand.vv v16, v8, v24
; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vand.vv v8, v8, v24
; RV64I-NEXT: vadd.vv v8, v16, v8
; RV64I-NEXT: vsrl.vi v16, v8, 4
; RV64I-NEXT: vadd.vv v8, v8, v16
-; RV64I-NEXT: lui a0, 61681
-; RV64I-NEXT: addiw a0, a0, -241
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: vand.vx v8, v8, a0
-; RV64I-NEXT: lui a0, 4112
-; RV64I-NEXT: addiw a0, a0, 257
-; RV64I-NEXT: slli a1, a0, 32
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: addi a1, sp, 16
+; RV64I-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64I-NEXT: vsrl.vi v16, v16, 3
+; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: vmul.vv v8, v8, v16
; RV64I-NEXT: li a0, 56
; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: csrr a0, vlenb
+; RV64I-NEXT: slli a0, a0, 3
+; RV64I-NEXT: add sp, sp, a0
+; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
;
; CHECK-F-LABEL: cttz_zero_undef_nxv8i64:
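For reference, the updated sequences above materialize only the 0x0f0f0f0f0f0f0f0f mask (lui 61681 / addi(w) -241, plus slli/add on RV64) and rebuild the remaining popcount constants from it with shifts and xors, rather than materializing each constant separately. A minimal standalone C sketch of the per-element arithmetic (illustrative only, not part of the patch):

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    uint64_t c4 = 0x0f0f0f0f0f0f0f0fULL; // the only constant actually materialized
    uint64_t c2 = c4 ^ (c4 << 2);        // vsll.vi 2 + vxor            -> 0x3333...
    uint64_t c1 = c2 ^ (c2 << 1);        // vsll.vi 1 (vadd.vv when unmasked) + vxor -> 0x5555...
    uint64_t m1 = c4 & (c4 >> 3);        // vsrl.vi 3 + vand            -> 0x0101... (multiplier)
    assert(c2 == 0x3333333333333333ULL);
    assert(c1 == 0x5555555555555555ULL);
    assert(m1 == 0x0101010101010101ULL);
    return 0;
  }

The additional vector spill/reload code visible in the LMUL=8 cases below appears to come from keeping these derived constant vectors live across the whole popcount sequence.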
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index 145ce6e917f962..071b76899e7523 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -1365,41 +1365,31 @@ declare <vscale x 1 x i64> @llvm.vp.cttz.nxv1i64(<vscale x 1 x i64>, i1 immarg,
define <vscale x 1 x i64> @vp_cttz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv1i64:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v9, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v9, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsll.vi v10, v9, 2, v0.t
+; RV32-NEXT: vxor.vv v10, v9, v10, v0.t
+; RV32-NEXT: vsll.vi v11, v10, 1, v0.t
+; RV32-NEXT: vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v12, v8, a0, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT: vand.vv v11, v12, v11, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v11, v0.t
+; RV32-NEXT: vand.vv v11, v8, v10, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v10, v0.t
+; RV32-NEXT: vadd.vv v8, v11, v8, v0.t
+; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vi v10, v9, 3, v0.t
+; RV32-NEXT: vand.vv v9, v9, v10, v0.t
; RV32-NEXT: vmul.vv v8, v8, v9, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -1407,38 +1397,34 @@ define <vscale x 1 x i64> @vp_cttz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x
;
; RV64-LABEL: vp_cttz_nxv1i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a1
; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT: vsub.vx v9, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v10, v9, 2, v0.t
+; RV64-NEXT: vxor.vx v10, v10, a1, v0.t
+; RV64-NEXT: vsll.vi v11, v10, 1, v0.t
+; RV64-NEXT: vxor.vv v11, v10, v11, v0.t
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v12, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vand.vv v8, v8, v9, v0.t
-; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v12, v0.t
+; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT: vand.vv v11, v12, v11, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v11, v0.t
+; RV64-NEXT: vand.vv v11, v8, v10, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v10, v0.t
+; RV64-NEXT: vadd.vv v8, v11, v8, v0.t
+; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v10, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v9, v9, 3, v0.t
+; RV64-NEXT: vand.vx v9, v9, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v9, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -1455,41 +1441,31 @@ define <vscale x 1 x i64> @vp_cttz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x
define <vscale x 1 x i64> @vp_cttz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv1i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vsub.vx v9, v8, a1
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v9, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsll.vi v10, v9, 2
+; RV32-NEXT: vxor.vv v10, v9, v10
+; RV32-NEXT: vadd.vv v11, v10, v10
+; RV32-NEXT: vxor.vv v11, v10, v11
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v12, v8, a0
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: vsrl.vi v12, v8, 1
+; RV32-NEXT: vand.vv v11, v12, v11
+; RV32-NEXT: vsub.vv v8, v8, v11
+; RV32-NEXT: vand.vv v11, v8, v10
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: vadd.vv v8, v11, v8
+; RV32-NEXT: vsrl.vi v10, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vi v10, v9, 3
+; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -1497,38 +1473,34 @@ define <vscale x 1 x i64> @vp_cttz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
;
; RV64-LABEL: vp_cttz_nxv1i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a1
; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT: vsub.vx v9, v8, a1
+; RV64-NEXT: vsll.vi v10, v9, 2
+; RV64-NEXT: vxor.vx v10, v10, a1
+; RV64-NEXT: vadd.vv v11, v10, v10
+; RV64-NEXT: vxor.vv v11, v10, v11
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v12, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v12
+; RV64-NEXT: vsrl.vi v12, v8, 1
+; RV64-NEXT: vand.vv v11, v12, v11
+; RV64-NEXT: vsub.vv v8, v8, v11
+; RV64-NEXT: vand.vv v11, v8, v10
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v10
+; RV64-NEXT: vadd.vv v8, v11, v8
+; RV64-NEXT: vsrl.vi v10, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v10
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v9, v9, 3
+; RV64-NEXT: vand.vx v9, v9, a1
+; RV64-NEXT: vmul.vv v8, v8, v9
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -1549,41 +1521,31 @@ declare <vscale x 2 x i64> @llvm.vp.cttz.nxv2i64(<vscale x 2 x i64>, i1 immarg,
define <vscale x 2 x i64> @vp_cttz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv2i64:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsll.vi v12, v10, 2, v0.t
+; RV32-NEXT: vxor.vv v12, v10, v12, v0.t
+; RV32-NEXT: vsll.vi v14, v12, 1, v0.t
+; RV32-NEXT: vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v16, v8, a0, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: vand.vv v14, v16, v14, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v14, v0.t
+; RV32-NEXT: vand.vv v14, v8, v12, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: vadd.vv v8, v14, v8, v0.t
+; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v12, v10, 3, v0.t
+; RV32-NEXT: vand.vv v10, v10, v12, v0.t
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -1591,38 +1553,34 @@ define <vscale x 2 x i64> @vp_cttz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x
;
; RV64-LABEL: vp_cttz_nxv2i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT: vsub.vx v10, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v12, v10, 2, v0.t
+; RV64-NEXT: vxor.vx v12, v12, a1, v0.t
+; RV64-NEXT: vsll.vi v14, v12, 1, v0.t
+; RV64-NEXT: vxor.vv v14, v12, v14, v0.t
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v16, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vand.vv v8, v8, v10, v0.t
-; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT: vand.vv v14, v16, v14, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v14, v0.t
+; RV64-NEXT: vand.vv v14, v8, v12, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v12, v0.t
+; RV64-NEXT: vadd.vv v8, v14, v8, v0.t
+; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v12, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v10, v10, 3, v0.t
+; RV64-NEXT: vand.vx v10, v10, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v10, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -1639,41 +1597,31 @@ define <vscale x 2 x i64> @vp_cttz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x
define <vscale x 2 x i64> @vp_cttz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv2i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsub.vx v10, v8, a1
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsll.vi v12, v10, 2
+; RV32-NEXT: vxor.vv v12, v10, v12
+; RV32-NEXT: vadd.vv v14, v12, v12
+; RV32-NEXT: vxor.vv v14, v12, v14
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v16, v8, a0
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v14, v16, v14
+; RV32-NEXT: vsub.vv v8, v8, v14
+; RV32-NEXT: vand.vv v14, v8, v12
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: vadd.vv v8, v14, v8
+; RV32-NEXT: vsrl.vi v12, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v12
; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v12, v10, 3
+; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -1681,38 +1629,34 @@ define <vscale x 2 x i64> @vp_cttz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
;
; RV64-LABEL: vp_cttz_nxv2i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT: vsub.vx v10, v8, a1
+; RV64-NEXT: vsll.vi v12, v10, 2
+; RV64-NEXT: vxor.vx v12, v12, a1
+; RV64-NEXT: vadd.vv v14, v12, v12
+; RV64-NEXT: vxor.vv v14, v12, v14
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v16, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vand.vv v14, v16, v14
+; RV64-NEXT: vsub.vv v8, v8, v14
+; RV64-NEXT: vand.vv v14, v8, v12
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v12
+; RV64-NEXT: vadd.vv v8, v14, v8
+; RV64-NEXT: vsrl.vi v12, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v12
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v10, v10, 3
+; RV64-NEXT: vand.vx v10, v10, a1
+; RV64-NEXT: vmul.vv v8, v8, v10
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -1733,41 +1677,31 @@ declare <vscale x 4 x i64> @llvm.vp.cttz.nxv4i64(<vscale x 4 x i64>, i1 immarg,
define <vscale x 4 x i64> @vp_cttz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv4i64:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsll.vi v16, v12, 2, v0.t
+; RV32-NEXT: vxor.vv v16, v12, v16, v0.t
+; RV32-NEXT: vsll.vi v20, v16, 1, v0.t
+; RV32-NEXT: vxor.vv v20, v16, v20, v0.t
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v24, v8, a0, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: vand.vv v20, v24, v20, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v20, v0.t
+; RV32-NEXT: vand.vv v20, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vadd.vv v8, v20, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vi v16, v12, 3, v0.t
+; RV32-NEXT: vand.vv v12, v12, v16, v0.t
; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -1775,38 +1709,34 @@ define <vscale x 4 x i64> @vp_cttz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x
;
; RV64-LABEL: vp_cttz_nxv4i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT: vsub.vx v12, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v12, 2, v0.t
+; RV64-NEXT: vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT: vsll.vi v20, v16, 1, v0.t
+; RV64-NEXT: vxor.vv v20, v16, v20, v0.t
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v24, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vand.vv v8, v8, v12, v0.t
-; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v24, v0.t
+; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT: vand.vv v20, v24, v20, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v20, v0.t
+; RV64-NEXT: vand.vv v20, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: vadd.vv v8, v20, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v12, v12, 3, v0.t
+; RV64-NEXT: vand.vx v12, v12, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v12, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -1823,41 +1753,31 @@ define <vscale x 4 x i64> @vp_cttz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x
define <vscale x 4 x i64> @vp_cttz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv4i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsub.vx v12, v8, a1
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsll.vi v16, v12, 2
+; RV32-NEXT: vxor.vv v16, v12, v16
+; RV32-NEXT: vadd.vv v20, v16, v16
+; RV32-NEXT: vxor.vv v20, v16, v20
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v24, v8, a0
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vand.vv v20, v24, v20
+; RV32-NEXT: vsub.vv v8, v8, v20
+; RV32-NEXT: vand.vv v20, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vadd.vv v8, v20, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vi v16, v12, 3
+; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -1865,38 +1785,34 @@ define <vscale x 4 x i64> @vp_cttz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
;
; RV64-LABEL: vp_cttz_nxv4i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT: vsub.vx v12, v8, a1
+; RV64-NEXT: vsll.vi v16, v12, 2
+; RV64-NEXT: vxor.vx v16, v16, a1
+; RV64-NEXT: vadd.vv v20, v16, v16
+; RV64-NEXT: vxor.vv v20, v16, v20
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v24, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: vsrl.vi v24, v8, 1
+; RV64-NEXT: vand.vv v20, v24, v20
+; RV64-NEXT: vsub.vv v8, v8, v20
+; RV64-NEXT: vand.vv v20, v8, v16
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vadd.vv v8, v20, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v12, v12, 3
+; RV64-NEXT: vand.vx v12, v12, a1
+; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -1917,82 +1833,182 @@ declare <vscale x 7 x i64> @llvm.vp.cttz.nxv7i64(<vscale x 7 x i64>, i1 immarg,
define <vscale x 7 x i64> @vp_cttz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv7i64:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT: vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT: vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT: vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v16, v8, a0, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv7i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v24, v16, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v24, 1, v0.t
+; RV64-NEXT: vxor.vv v8, v24, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vx v16, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vand.vv v8, v8, v16, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
+; RV64-NEXT: vand.vv v16, v8, v24, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v24, v0.t
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_cttz_nxv7i64:
@@ -2007,82 +2023,106 @@ define <vscale x 7 x i64> @vp_cttz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x
define <vscale x 7 x i64> @vp_cttz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv7i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsll.vi v24, v16, 2
+; RV32-NEXT: vxor.vv v24, v16, v24
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v0, v8, a0
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vadd.vv v0, v24, v24
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v8
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v0
+; RV32-NEXT: vand.vv v0, v8, v24
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv7i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1
+; RV64-NEXT: vsll.vi v24, v16, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v0, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v0, v24, v0
; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vand.vv v16, v16, v0
; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
+; RV64-NEXT: vand.vv v16, v8, v24
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v24
; RV64-NEXT: vadd.vv v8, v16, v8
; RV64-NEXT: vsrl.vi v16, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_cttz_nxv7i64_unmasked:
@@ -2101,82 +2141,182 @@ declare <vscale x 8 x i64> @llvm.vp.cttz.nxv8i64(<vscale x 8 x i64>, i1 immarg,
define <vscale x 8 x i64> @vp_cttz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT: vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT: vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT: vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v16, v8, a0, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v24, v16, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v24, 1, v0.t
+; RV64-NEXT: vxor.vv v8, v24, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vx v16, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vand.vv v8, v8, v16, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
+; RV64-NEXT: vand.vv v16, v8, v24, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v24, v0.t
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_cttz_nxv8i64:
@@ -2191,82 +2331,106 @@ define <vscale x 8 x i64> @vp_cttz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x
define <vscale x 8 x i64> @vp_cttz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_nxv8i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsll.vi v24, v16, 2
+; RV32-NEXT: vxor.vv v24, v16, v24
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v0, v8, a0
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vadd.vv v0, v24, v24
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v8
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v0
+; RV32-NEXT: vand.vv v0, v8, v24
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_nxv8i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1
+; RV64-NEXT: vsll.vi v24, v16, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v0, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v0, v24, v0
; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vand.vv v16, v16, v0
; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
+; RV64-NEXT: vand.vv v16, v8, v24
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v24
; RV64-NEXT: vadd.vv v8, v16, v8
; RV64-NEXT: vsrl.vi v16, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_cttz_nxv8i64_unmasked:
@@ -2288,13 +2452,19 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 56
+; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV32-NEXT: vmv1r.v v24, v0
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -2305,120 +2475,114 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: and a3, a3, a2
+; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT: vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV32-NEXT: li a2, 1
-; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v16, a2, v0.t
-; RV32-NEXT: vnot.v v16, v16, v0.t
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: lui a4, 349525
-; RV32-NEXT: addi a4, a4, 1365
-; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a4
-; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v16, v8, a2, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vnot.v v16, v8, v0.t
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: lui a4, 209715
-; RV32-NEXT: addi a4, a4, 819
-; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a4
-; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT: lui a4, 61681
-; RV32-NEXT: addi a4, a4, -241
-; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a4
-; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: lui a4, 4112
-; RV32-NEXT: addi a4, a4, 257
-; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a4
-; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a3, 56
; RV32-NEXT: vsrl.vx v8, v8, a3, v0.t
-; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB46_2
; RV32-NEXT: # %bb.1:
@@ -2427,107 +2591,113 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vx v8, v16, a2, v0.t
-; RV32-NEXT: vnot.v v16, v16, v0.t
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v8, v16, a2, v0.t
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vnot.v v8, v16, v0.t
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a3, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a3, v0.t
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 56
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
@@ -2538,89 +2708,235 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: li a2, 48
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV64-NEXT: vmv1r.v v24, v0
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: slli a1, a1, 4
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: srli a2, a1, 3
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: srli a1, a2, 3
; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
-; RV64-NEXT: vslidedown.vx v0, v0, a2
-; RV64-NEXT: sub a2, a0, a1
-; RV64-NEXT: sltu a3, a0, a2
+; RV64-NEXT: vslidedown.vx v0, v0, a1
+; RV64-NEXT: sub a1, a0, a2
+; RV64-NEXT: sltu a3, a0, a1
; RV64-NEXT: addi a3, a3, -1
-; RV64-NEXT: and a3, a3, a2
-; RV64-NEXT: li a2, 1
+; RV64-NEXT: and a3, a3, a1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a4, a1, 32
+; RV64-NEXT: add a1, a1, a4
+; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: li a5, 40
+; RV64-NEXT: mul a4, a4, a5
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v8, v16, a2, v0.t
-; RV64-NEXT: vnot.v v16, v16, v0.t
-; RV64-NEXT: vand.vv v16, v16, v8, v0.t
-; RV64-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV64-NEXT: lui a3, 349525
-; RV64-NEXT: addiw a3, a3, 1365
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
-; RV64-NEXT: vsub.vv v16, v16, v8, v0.t
-; RV64-NEXT: lui a4, 209715
-; RV64-NEXT: addiw a4, a4, 819
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vand.vx v8, v16, a4, v0.t
-; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT: vand.vx v16, v16, a4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT: vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 3
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: li a3, 1
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vx v8, v16, a3, v0.t
+; RV64-NEXT: addi a4, sp, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: vnot.v v8, v16, v0.t
+; RV64-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 3
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 5
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 5
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a5, 61681
-; RV64-NEXT: addiw a5, a5, -241
-; RV64-NEXT: slli a6, a5, 32
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: vand.vx v8, v8, a5, v0.t
-; RV64-NEXT: lui a6, 4112
-; RV64-NEXT: addiw a6, a6, 257
-; RV64-NEXT: slli a7, a6, 32
-; RV64-NEXT: add a6, a6, a7
-; RV64-NEXT: vmul.vx v8, v8, a6, v0.t
-; RV64-NEXT: li a7, 56
-; RV64-NEXT: vsrl.vx v8, v8, a7, v0.t
-; RV64-NEXT: addi t0, sp, 16
-; RV64-NEXT: vs8r.v v8, (t0) # Unknown-size Folded Spill
-; RV64-NEXT: bltu a0, a1, .LBB46_2
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: li a5, 40
+; RV64-NEXT: mul a4, a4, a5
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT: li a4, 56
+; RV64-NEXT: vsrl.vx v8, v8, a4, v0.t
+; RV64-NEXT: csrr a5, vlenb
+; RV64-NEXT: slli a5, a5, 4
+; RV64-NEXT: add a5, sp, a5
+; RV64-NEXT: addi a5, a5, 16
+; RV64-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
+; RV64-NEXT: bltu a0, a2, .LBB46_2
; RV64-NEXT: # %bb.1:
-; RV64-NEXT: mv a0, a1
+; RV64-NEXT: mv a0, a2
; RV64-NEXT: .LBB46_2:
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vmv1r.v v0, v24
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: li a2, 40
+; RV64-NEXT: mul a0, a0, a2
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT: vsub.vx v16, v8, a2, v0.t
-; RV64-NEXT: vnot.v v8, v8, v0.t
+; RV64-NEXT: vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vx v8, v16, a3, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vnot.v v8, v16, v0.t
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vand.vv v8, v8, v16, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: vand.vx v16, v16, a3, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: vand.vx v16, v8, a4, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a4, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: vand.vx v8, v8, a5, v0.t
-; RV64-NEXT: vmul.vx v8, v8, a6, v0.t
-; RV64-NEXT: vsrl.vx v8, v8, a7, v0.t
-; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 40
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT: vsrl.vx v8, v8, a4, v0.t
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 48
+; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
@@ -2656,12 +2972,12 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 40
+; RV32-NEXT: li a2, 24
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -2669,62 +2985,53 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: sltu a3, a0, a2
; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: and a3, a3, a2
+; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: li a2, 1
-; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v16, a2
+; RV32-NEXT: vsub.vx v0, v16, a2
; RV32-NEXT: vnot.v v16, v16
-; RV32-NEXT: vand.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v24, v8, 1
-; RV32-NEXT: lui a4, 349525
-; RV32-NEXT: addi a4, a4, 1365
-; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
-; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v24, v16
-; RV32-NEXT: vsub.vv v8, v8, v24
-; RV32-NEXT: lui a4, 209715
-; RV32-NEXT: addi a4, a4, 819
-; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v0, a4
-; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v0
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: lui a4, 61681
-; RV32-NEXT: addi a4, a4, -241
-; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
-; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: lui a4, 4112
-; RV32-NEXT: addi a4, a4, 257
-; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a4
-; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v16, v0
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v0, v24, 2
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: vadd.vv v16, v0, v0
+; RV32-NEXT: vxor.vv v16, v0, v16
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vmul.vv v16, v16, v8
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: vand.vv v16, v8, v0
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
+; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a3, 56
-; RV32-NEXT: vsrl.vx v8, v16, a3
-; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vsrl.vx v8, v8, a3
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 3
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB47_2
; RV32-NEXT: # %bb.1:
@@ -2732,45 +3039,52 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
; RV32-NEXT: .LBB47_2:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vx v16, v24, a2
-; RV32-NEXT: vnot.v v24, v24
-; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: vsrl.vi v24, v16, 1
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v8, v16, a2
+; RV32-NEXT: vnot.v v0, v16
+; RV32-NEXT: vand.vv v16, v0, v8
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v0, v24, 2
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: vadd.vv v8, v0, v0
+; RV32-NEXT: vxor.vv v8, v0, v8
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v8
-; RV32-NEXT: vsub.vv v16, v16, v24
-; RV32-NEXT: vand.vv v24, v16, v0
-; RV32-NEXT: vsrl.vi v16, v16, 2
-; RV32-NEXT: vand.vv v16, v16, v0
-; RV32-NEXT: vadd.vv v16, v24, v16
-; RV32-NEXT: vsrl.vi v24, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v16, 1
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v16, v8
+; RV32-NEXT: vand.vv v16, v8, v0
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
+; RV32-NEXT: vmul.vv v8, v8, v16
+; RV32-NEXT: vsrl.vx v8, v8, a3
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v16, v8
-; RV32-NEXT: vsrl.vx v8, v8, a3
-; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
@@ -2778,65 +3092,109 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
;
; RV64-LABEL: vp_cttz_nxv16i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: sub a2, a0, a1
-; RV64-NEXT: sltu a3, a0, a2
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: sub a1, a0, a2
+; RV64-NEXT: sltu a3, a0, a1
; RV64-NEXT: addi a3, a3, -1
-; RV64-NEXT: and a3, a3, a2
-; RV64-NEXT: li a2, 1
+; RV64-NEXT: and a3, a3, a1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a4, a1, 32
+; RV64-NEXT: add a1, a1, a4
+; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v24, v16, a2
+; RV64-NEXT: li a3, 1
+; RV64-NEXT: vsub.vx v0, v16, a3
; RV64-NEXT: vnot.v v16, v16
-; RV64-NEXT: vand.vv v16, v16, v24
-; RV64-NEXT: vsrl.vi v24, v16, 1
-; RV64-NEXT: lui a3, 349525
-; RV64-NEXT: addiw a3, a3, 1365
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v24, v24, a3
-; RV64-NEXT: vsub.vv v16, v16, v24
-; RV64-NEXT: lui a4, 209715
-; RV64-NEXT: addiw a4, a4, 819
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vand.vx v24, v16, a4
-; RV64-NEXT: vsrl.vi v16, v16, 2
-; RV64-NEXT: vand.vx v16, v16, a4
-; RV64-NEXT: vadd.vv v16, v24, v16
-; RV64-NEXT: vsrl.vi v24, v16, 4
-; RV64-NEXT: vadd.vv v16, v16, v24
-; RV64-NEXT: lui a5, 61681
-; RV64-NEXT: addiw a5, a5, -241
-; RV64-NEXT: slli a6, a5, 32
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: vand.vx v16, v16, a5
-; RV64-NEXT: lui a6, 4112
-; RV64-NEXT: addiw a6, a6, 257
-; RV64-NEXT: slli a7, a6, 32
-; RV64-NEXT: add a6, a6, a7
-; RV64-NEXT: vmul.vx v16, v16, a6
-; RV64-NEXT: li a7, 56
-; RV64-NEXT: vsrl.vx v16, v16, a7
-; RV64-NEXT: bltu a0, a1, .LBB47_2
+; RV64-NEXT: vand.vv v16, v16, v0
+; RV64-NEXT: vsll.vi v0, v8, 2
+; RV64-NEXT: vxor.vx v0, v0, a1
+; RV64-NEXT: vadd.vv v24, v0, v0
+; RV64-NEXT: vxor.vv v24, v0, v24
+; RV64-NEXT: vsrl.vi v8, v16, 1
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: vsub.vv v8, v16, v8
+; RV64-NEXT: vand.vv v16, v8, v0
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT: vmv8r.v v0, v16
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
+; RV64-NEXT: li a4, 56
+; RV64-NEXT: vsrl.vx v8, v8, a4
+; RV64-NEXT: addi a5, sp, 16
+; RV64-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
+; RV64-NEXT: bltu a0, a2, .LBB47_2
; RV64-NEXT: # %bb.1:
-; RV64-NEXT: mv a0, a1
+; RV64-NEXT: mv a0, a2
; RV64-NEXT: .LBB47_2:
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v24, v8, a2
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: vsrl.vi v24, v8, 1
-; RV64-NEXT: vand.vx v24, v24, a3
-; RV64-NEXT: vsub.vv v8, v8, v24
-; RV64-NEXT: vand.vx v24, v8, a4
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vx v8, v16, a3
+; RV64-NEXT: vnot.v v24, v16
+; RV64-NEXT: vand.vv v8, v24, v8
+; RV64-NEXT: vsll.vi v24, v0, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v16, v24, v0
+; RV64-NEXT: vsrl.vi v0, v8, 1
+; RV64-NEXT: vand.vv v16, v0, v16
+; RV64-NEXT: vsub.vv v8, v8, v16
+; RV64-NEXT: vand.vv v16, v8, v24
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a4
-; RV64-NEXT: vadd.vv v8, v24, v8
-; RV64-NEXT: vsrl.vi v24, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v24
-; RV64-NEXT: vand.vx v8, v8, a5
-; RV64-NEXT: vmul.vx v8, v8, a6
-; RV64-NEXT: vsrl.vx v8, v8, a7
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
+; RV64-NEXT: vsrl.vx v8, v8, a4
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_cttz_nxv16i64_unmasked:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index 36f22bd3259cf9..8ab3ee6206de73 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -913,35 +913,26 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
; RV32-NEXT: vor.vv v8, v8, v9, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10, v0.t
+; RV32-NEXT: vsll.vi v11, v10, 2, v0.t
+; RV32-NEXT: vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT: vsll.vi v12, v11, 1, v0.t
+; RV32-NEXT: vxor.vv v12, v11, v12, v0.t
+; RV32-NEXT: vand.vv v9, v9, v12, v0.t
; RV32-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9, v0.t
+; RV32-NEXT: vand.vv v9, v8, v11, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v11, v0.t
+; RV32-NEXT: vadd.vv v8, v9, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v10, v0.t
+; RV32-NEXT: vsrl.vi v9, v10, 3, v0.t
+; RV32-NEXT: vand.vv v9, v10, v9, v0.t
; RV32-NEXT: vmul.vv v8, v8, v9, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -960,37 +951,34 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
; RV64-NEXT: vor.vv v8, v8, v9, v0.t
; RV64-NEXT: vsrl.vi v9, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v9, v0.t
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v9, v8, a0, v0.t
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v9, v8, a1, v0.t
; RV64-NEXT: vor.vv v8, v8, v9, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0, v0.t
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV64-NEXT: vsll.vi v11, v10, 2, v0.t
+; RV64-NEXT: vxor.vx v11, v11, a1, v0.t
+; RV64-NEXT: vsll.vi v12, v11, 1, v0.t
+; RV64-NEXT: vxor.vv v12, v11, v12, v0.t
+; RV64-NEXT: vand.vv v9, v9, v12, v0.t
; RV64-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0, v0.t
+; RV64-NEXT: vand.vv v9, v8, v11, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v11, v0.t
; RV64-NEXT: vadd.vv v8, v9, v8, v0.t
; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v9, v10, 3, v0.t
+; RV64-NEXT: vand.vx v9, v9, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v9, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -1017,35 +1005,26 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vor.vv v8, v8, v9
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10
+; RV32-NEXT: vsll.vi v11, v10, 2
+; RV32-NEXT: vxor.vv v11, v10, v11
+; RV32-NEXT: vadd.vv v12, v11, v11
+; RV32-NEXT: vxor.vv v12, v11, v12
+; RV32-NEXT: vand.vv v9, v9, v12
; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9
+; RV32-NEXT: vand.vv v9, v8, v11
; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vadd.vv v8, v10, v8
+; RV32-NEXT: vand.vv v8, v8, v11
+; RV32-NEXT: vadd.vv v8, v9, v8
; RV32-NEXT: vsrl.vi v9, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: vsrl.vi v9, v10, 3
+; RV32-NEXT: vand.vv v9, v10, v9
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -1064,37 +1043,34 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: vor.vv v8, v8, v9
; RV64-NEXT: vsrl.vi v9, v8, 16
; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v9, v8, a0
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v9, v8, a1
; RV64-NEXT: vor.vv v8, v8, v9
; RV64-NEXT: vnot.v v8, v8
; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV64-NEXT: vsll.vi v11, v10, 2
+; RV64-NEXT: vxor.vx v11, v11, a1
+; RV64-NEXT: vadd.vv v12, v11, v11
+; RV64-NEXT: vxor.vv v12, v11, v12
+; RV64-NEXT: vand.vv v9, v9, v12
; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0
+; RV64-NEXT: vand.vv v9, v8, v11
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v11
; RV64-NEXT: vadd.vv v8, v9, v8
; RV64-NEXT: vsrl.vi v9, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v9, v10, 3
+; RV64-NEXT: vand.vx v9, v9, a1
+; RV64-NEXT: vmul.vv v8, v8, v9
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -1125,35 +1101,26 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
; RV32-NEXT: vor.vv v8, v8, v10, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12, v0.t
+; RV32-NEXT: vsll.vi v14, v12, 2, v0.t
+; RV32-NEXT: vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT: vsll.vi v16, v14, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v14, v16, v0.t
+; RV32-NEXT: vand.vv v10, v10, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10, v0.t
+; RV32-NEXT: vand.vv v10, v8, v14, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v14, v0.t
+; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: vsrl.vi v10, v12, 3, v0.t
+; RV32-NEXT: vand.vv v10, v12, v10, v0.t
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -1172,37 +1139,34 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
; RV64-NEXT: vor.vv v8, v8, v10, v0.t
; RV64-NEXT: vsrl.vi v10, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v10, v0.t
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v10, v8, a0, v0.t
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v10, v8, a1, v0.t
; RV64-NEXT: vor.vv v8, v8, v10, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0, v0.t
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV64-NEXT: vsll.vi v14, v12, 2, v0.t
+; RV64-NEXT: vxor.vx v14, v14, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v14, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v14, v16, v0.t
+; RV64-NEXT: vand.vv v10, v10, v16, v0.t
; RV64-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0, v0.t
+; RV64-NEXT: vand.vv v10, v8, v14, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v14, v0.t
; RV64-NEXT: vadd.vv v8, v10, v8, v0.t
; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v10, v12, 3, v0.t
+; RV64-NEXT: vand.vx v10, v10, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v10, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -1229,35 +1193,26 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12
+; RV32-NEXT: vsll.vi v14, v12, 2
+; RV32-NEXT: vxor.vv v14, v12, v14
+; RV32-NEXT: vadd.vv v16, v14, v14
+; RV32-NEXT: vxor.vv v16, v14, v16
+; RV32-NEXT: vand.vv v10, v10, v16
; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10
+; RV32-NEXT: vand.vv v10, v8, v14
; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v12, v8
+; RV32-NEXT: vand.vv v8, v8, v14
+; RV32-NEXT: vadd.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v10, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v10
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: vsrl.vi v10, v12, 3
+; RV32-NEXT: vand.vv v10, v12, v10
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -1276,37 +1231,34 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vsrl.vi v10, v8, 16
; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v10, v8, a0
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v10, v8, a1
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vnot.v v8, v8
; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV64-NEXT: vsll.vi v14, v12, 2
+; RV64-NEXT: vxor.vx v14, v14, a1
+; RV64-NEXT: vadd.vv v16, v14, v14
+; RV64-NEXT: vxor.vv v16, v14, v16
+; RV64-NEXT: vand.vv v10, v10, v16
; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0
+; RV64-NEXT: vand.vv v10, v8, v14
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v14
; RV64-NEXT: vadd.vv v8, v10, v8
; RV64-NEXT: vsrl.vi v10, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v10, v12, 3
+; RV64-NEXT: vand.vx v10, v10, a1
+; RV64-NEXT: vmul.vv v8, v8, v10
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -1335,38 +1287,29 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v12, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT: vnot.v v12, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v12, 1, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
+; RV32-NEXT: vsll.vi v20, v8, 2, v0.t
+; RV32-NEXT: vxor.vv v20, v8, v20, v0.t
+; RV32-NEXT: vsll.vi v24, v20, 1, v0.t
+; RV32-NEXT: vxor.vv v24, v20, v24, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsub.vv v12, v12, v16, v0.t
+; RV32-NEXT: vand.vv v16, v12, v20, v0.t
+; RV32-NEXT: vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT: vand.vv v12, v12, v20, v0.t
+; RV32-NEXT: vadd.vv v12, v16, v12, v0.t
+; RV32-NEXT: vsrl.vi v16, v12, 4, v0.t
+; RV32-NEXT: vadd.vv v12, v12, v16, v0.t
+; RV32-NEXT: vand.vv v12, v12, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vmul.vv v8, v12, v8, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV32-NEXT: ret
@@ -1384,37 +1327,34 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
; RV64-NEXT: vor.vv v8, v8, v12, v0.t
; RV64-NEXT: vsrl.vi v12, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v12, v0.t
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v12, v8, a0, v0.t
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v12, v8, a1, v0.t
; RV64-NEXT: vor.vv v8, v8, v12, v0.t
-; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vnot.v v12, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v12, 1, v0.t
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV64-NEXT: vsll.vi v20, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v20, v20, a1, v0.t
+; RV64-NEXT: vsll.vi v24, v20, 1, v0.t
+; RV64-NEXT: vxor.vv v24, v20, v24, v0.t
+; RV64-NEXT: vand.vv v16, v16, v24, v0.t
+; RV64-NEXT: vsub.vv v12, v12, v16, v0.t
+; RV64-NEXT: vand.vv v16, v12, v20, v0.t
+; RV64-NEXT: vsrl.vi v12, v12, 2, v0.t
+; RV64-NEXT: vand.vv v12, v12, v20, v0.t
+; RV64-NEXT: vadd.vv v12, v16, v12, v0.t
+; RV64-NEXT: vsrl.vi v16, v12, 4, v0.t
+; RV64-NEXT: vadd.vv v12, v12, v16, v0.t
+; RV64-NEXT: vand.vx v12, v12, a1, v0.t
+; RV64-NEXT: vsrl.vi v8, v8, 3, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v12, v8, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -1441,35 +1381,26 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16
+; RV32-NEXT: vsll.vi v20, v16, 2
+; RV32-NEXT: vxor.vv v20, v16, v20
+; RV32-NEXT: vadd.vv v24, v20, v20
+; RV32-NEXT: vxor.vv v24, v20, v24
+; RV32-NEXT: vand.vv v12, v12, v24
; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12
+; RV32-NEXT: vand.vv v12, v8, v20
; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vand.vv v8, v8, v20
+; RV32-NEXT: vadd.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v12, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v12
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v12, v16, 3
+; RV32-NEXT: vand.vv v12, v16, v12
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -1488,37 +1419,34 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: vor.vv v8, v8, v12
; RV64-NEXT: vsrl.vi v12, v8, 16
; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v12, v8, a0
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v12, v8, a1
; RV64-NEXT: vor.vv v8, v8, v12
; RV64-NEXT: vnot.v v8, v8
; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV64-NEXT: vsll.vi v20, v16, 2
+; RV64-NEXT: vxor.vx v20, v20, a1
+; RV64-NEXT: vadd.vv v24, v20, v20
+; RV64-NEXT: vxor.vv v24, v20, v24
+; RV64-NEXT: vand.vv v12, v12, v24
; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0
+; RV64-NEXT: vand.vv v12, v8, v20
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v20
; RV64-NEXT: vadd.vv v8, v12, v8
; RV64-NEXT: vsrl.vi v12, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v12, v16, 3
+; RV64-NEXT: vand.vx v12, v12, a1
+; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -1533,24 +1461,13 @@ declare <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -1566,40 +1483,80 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v24, v8, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v24, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v24, v16, v0.t
+; RV32-NEXT: vand.vv v24, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v16, 2, v0.t
+; RV32-NEXT: vand.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_v15i64:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
@@ -1611,39 +1568,65 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v24, v16, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v16, v24, v16, v0.t
+; RV64-NEXT: vand.vv v24, v16, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: vadd.vv v8, v24, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
ret <15 x i64> %v
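For the m8 cases above, keeping the splat and the intermediate masks live forces whole-register spills, which is where the larger frames come from. A small sketch of the layout implied by the prologue and the reload addressing above (the vlenb value is an assumed, illustrative figure):

  // Sketch, assuming the three m8 temporaries visible in the vs8r.v/vl8r.v
  // traffic above plus the 16-byte fixed area: frame = 16 + 24 * vlenb,
  // matching the ".cfi_escape ... # sp + 16 + 24 * vlenb" comment.
  #include <cstdio>

  int main() {
    const unsigned vlenb = 16;                // e.g. VLEN=128; illustrative value
    const unsigned frame = 16 + 24 * vlenb;   // csrr vlenb; li 24; mul; sub sp
    const unsigned slot0 = 16;                // addi a1, sp, 16
    const unsigned slot1 = 16 + 8 * vlenb;    // slli a1, a1, 3 before the add
    const unsigned slot2 = 16 + 16 * vlenb;   // slli a1, a1, 4 before the add
    printf("frame=%u slots at %u, %u, %u\n", frame, slot0, slot1, slot2);
    return 0;
  }

The unmasked variants below get by with two slots (16 * vlenb), and the v32i64 cases need six (48 * vlenb), which matches the other .cfi_escape changes in this file.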
@@ -1652,24 +1635,12 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v15i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vor.vv v8, v8, v16
@@ -1685,40 +1656,58 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsrl.vx v16, v8, a1
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v8, v8, v16
+; RV32-NEXT: vsll.vi v0, v8, 2
+; RV32-NEXT: vxor.vv v0, v8, v0
+; RV32-NEXT: vadd.vv v24, v0, v0
+; RV32-NEXT: vxor.vv v24, v0, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v16, v24
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: vsrl.vi v16, v16, 2
+; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: vadd.vv v16, v24, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: vand.vv v16, v16, v8
+; RV32-NEXT: vsrl.vi v24, v8, 3
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vmul.vv v8, v16, v8
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_v15i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v16, v8, 1
; RV64-NEXT: vor.vv v8, v8, v16
@@ -1730,39 +1719,54 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: vor.vv v8, v8, v16
; RV64-NEXT: vsrl.vi v16, v8, 16
; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v16, v8, a0
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v16, v8, a1
; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vnot.v v16, v8
+; RV64-NEXT: vsrl.vi v8, v16, 1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v0, v8, 2
+; RV64-NEXT: vxor.vx v0, v0, a1
+; RV64-NEXT: vadd.vv v24, v0, v0
+; RV64-NEXT: vxor.vv v24, v0, v24
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v24, v8, v24
+; RV64-NEXT: vsub.vv v16, v16, v24
+; RV64-NEXT: vand.vv v24, v16, v0
+; RV64-NEXT: vsrl.vi v16, v16, 2
+; RV64-NEXT: vand.vv v16, v16, v0
+; RV64-NEXT: vadd.vv v16, v24, v16
+; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vadd.vv v16, v16, v24
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v8, v8, 3
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vmul.vv v8, v16, v8
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <15 x i1> poison, i1 true, i32 0
%m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer
@@ -1775,24 +1779,13 @@ declare <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32)
define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -1808,40 +1801,80 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vp_ctlz_v16i64:
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v24, v8, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v24, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v24, v16, v0.t
+; RV32-NEXT: vand.vv v24, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v16, 2, v0.t
+; RV32-NEXT: vand.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT: li a0, 56
+; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vp_ctlz_v16i64:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
@@ -1853,39 +1886,65 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v24, v16, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v16, v24, v16, v0.t
+; RV64-NEXT: vand.vv v24, v16, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: vadd.vv v8, v24, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl)
ret <16 x i64> %v
@@ -1894,24 +1953,12 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vor.vv v8, v8, v16
@@ -1927,40 +1974,58 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsrl.vx v16, v8, a1
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v8, v8, v16
+; RV32-NEXT: vsll.vi v0, v8, 2
+; RV32-NEXT: vxor.vv v0, v8, v0
+; RV32-NEXT: vadd.vv v24, v0, v0
+; RV32-NEXT: vxor.vv v24, v0, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v16, v24
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: vsrl.vi v16, v16, 2
+; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: vadd.vv v16, v24, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: vand.vv v16, v16, v8
+; RV32-NEXT: vsrl.vi v24, v8, 3
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vmul.vv v8, v16, v8
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_v16i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v16, v8, 1
; RV64-NEXT: vor.vv v8, v8, v16
@@ -1972,39 +2037,54 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: vor.vv v8, v8, v16
; RV64-NEXT: vsrl.vi v16, v8, 16
; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v16, v8, a0
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v16, v8, a1
; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vnot.v v16, v8
+; RV64-NEXT: vsrl.vi v8, v16, 1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v0, v8, 2
+; RV64-NEXT: vxor.vx v0, v0, a1
+; RV64-NEXT: vadd.vv v24, v0, v0
+; RV64-NEXT: vxor.vv v24, v0, v24
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v24, v8, v24
+; RV64-NEXT: vsub.vv v16, v16, v24
+; RV64-NEXT: vand.vv v24, v16, v0
+; RV64-NEXT: vsrl.vi v16, v16, 2
+; RV64-NEXT: vand.vv v16, v16, v0
+; RV64-NEXT: vadd.vv v16, v24, v16
+; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vadd.vv v16, v16, v24
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v8, v8, 3
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vmul.vv v8, v16, v8
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <16 x i1> poison, i1 true, i32 0
%m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
@@ -2017,39 +2097,24 @@ declare <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64>, i1 immarg, <32 x i1>, i32)
define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v32i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 56
+; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: li a1, 16
; RV32-NEXT: vslidedown.vi v24, v0, 2
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: mv a2, a0
-; RV32-NEXT: bltu a0, a3, .LBB34_2
+; RV32-NEXT: bltu a0, a1, .LBB34_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a2, 16
; RV32-NEXT: .LBB34_2:
@@ -2069,113 +2134,114 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
@@ -2184,9 +2250,10 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a0, a0, a3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
@@ -2201,81 +2268,119 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: vsub.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 56
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_v32i64:
@@ -2283,23 +2388,25 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: li a2, 48
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT: li a2, 16
+; RV64-NEXT: li a1, 16
; RV64-NEXT: vslidedown.vi v24, v0, 2
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: bltu a0, a2, .LBB34_2
+; RV64-NEXT: mv a3, a0
+; RV64-NEXT: bltu a0, a1, .LBB34_2
; RV64-NEXT: # %bb.1:
-; RV64-NEXT: li a1, 16
+; RV64-NEXT: li a3, 16
; RV64-NEXT: .LBB34_2:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t
@@ -2310,81 +2417,238 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
-; RV64-NEXT: li a1, 32
-; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t
+; RV64-NEXT: li a2, 32
+; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a2, 349525
-; RV64-NEXT: addiw a2, a2, 1365
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: vand.vx v16, v16, a2, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a3, 209715
-; RV64-NEXT: addiw a3, a3, 819
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v16, v8, a3, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
-; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a4, 61681
-; RV64-NEXT: addiw a4, a4, -241
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vand.vx v8, v8, a4, v0.t
-; RV64-NEXT: lui a5, 4112
-; RV64-NEXT: addiw a5, a5, 257
-; RV64-NEXT: slli a6, a5, 32
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT: li a6, 56
-; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT: addi a7, sp, 16
-; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT: addi a7, a0, -16
-; RV64-NEXT: sltu a0, a0, a7
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a7
-; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a4, a1, 32
+; RV64-NEXT: add a1, a1, a4
+; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: li a5, 40
+; RV64-NEXT: mul a4, a4, a5
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: addi a3, sp, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vxor.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 3
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 3
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v16, v16, v8, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: addi a3, sp, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v16, v8, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 40
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT: li a3, 56
+; RV64-NEXT: vsrl.vx v8, v8, a3, v0.t
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: addi a4, a0, -16
+; RV64-NEXT: sltu a0, a0, a4
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: and a0, a0, a4
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vmv1r.v v0, v24
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: li a4, 24
+; RV64-NEXT: mul a0, a0, a4
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: vor.vv v16, v8, v16, v0.t
-; RV64-NEXT: vsrl.vi v8, v16, 2, v0.t
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v8, v16, 1, v0.t
; RV64-NEXT: vor.vv v8, v16, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t
+; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 8, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
-; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t
+; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: vand.vx v16, v16, a2, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 40
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsll.vi v16, v16, 2, v0.t
+; RV64-NEXT: vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsll.vi v8, v16, 1, v0.t
+; RV64-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: vand.vx v16, v8, a3, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: vand.vx v8, v8, a4, v0.t
-; RV64-NEXT: vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT: vsrl.vx v16, v8, a6, v0.t
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 40
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT: vsrl.vx v16, v8, a3, v0.t
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 48
+; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
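
The recurring pattern in the updated checks is that only 0x0f0f0f0f0f0f0f0f is materialized (and splatted with vmv.v.x); the other popcount masks are derived from it with one vector shift plus one xor/and each, instead of a four-instruction lui/addiw/slli/add sequence per constant. A minimal standalone C++ sketch of the identities the vsll/vxor, vadd/vxor and vsrl/vand sequences rely on (illustrative only, not part of the patch; names are made up):

  #include <cstdint>

  constexpr uint64_t C4  = 0x0f0f0f0f0f0f0f0fULL; // the only mask materialized
  constexpr uint64_t C2  = C4 ^ (C4 << 2);        // vsll.vi ..., 2 ; vxor
  constexpr uint64_t C1  = C2 ^ (C2 << 1);        // vsll.vi ..., 1 (or vadd.vv x, x) ; vxor
  constexpr uint64_t H01 = C4 & (C4 >> 3);        // vsrl.vi ..., 3 ; vand

  static_assert(C1  == 0x5555555555555555ULL, "mask for the x - ((x >> 1) & C1) step");
  static_assert(C2  == 0x3333333333333333ULL, "mask for the 2-bit field sums");
  static_assert(H01 == 0x0101010101010101ULL, "byte-sum multiplier");

The derived 0x0101... value is the multiplier fed to vmul.vv, after which the final vsrl.vx by 56 brings the summed per-byte counts down from the top byte.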
@@ -2395,160 +2659,170 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
-; RV32-NEXT: vmv8r.v v24, v16
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT: li a2, 16
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: mv a1, a0
; RV32-NEXT: bltu a0, a2, .LBB35_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB35_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v24, v8, 2
+; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v24, v8, 8
+; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v24, v8, 16
+; RV32-NEXT: vor.vv v8, v8, v24
; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsrl.vx v16, v8, a2
-; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vx v24, v8, a2
+; RV32-NEXT: vor.vv v8, v8, v24
; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a3), zero
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: vsll.vi v0, v24, 2
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: vadd.vv v16, v0, v0
+; RV32-NEXT: vxor.vv v16, v0, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: addi a3, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a3), zero
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v8, v0
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v0
; RV32-NEXT: vadd.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: addi a3, sp, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vx v8, v8, a1
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 1
-; RV32-NEXT: vor.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v24, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vsrl.vi v24, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vsrl.vx v24, v8, a2
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vsrl.vi v24, v8, 1
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16
-; RV32-NEXT: vsub.vv v8, v8, v24
-; RV32-NEXT: vand.vv v24, v8, v0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v0
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vor.vv v16, v8, v16
+; RV32-NEXT: vsrl.vi v0, v16, 2
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vsrl.vi v0, v16, 4
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vsrl.vi v0, v16, 8
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vsrl.vi v0, v16, 16
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vsrl.vx v0, v16, a2
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vnot.v v8, v16
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v0, v24, 2
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: vadd.vv v16, v0, v0
+; RV32-NEXT: vxor.vv v16, v0, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: addi a0, sp, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: vand.vv v16, v8, v0
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vsrl.vx v16, v8, a1
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_v32i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a2, 16
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: bltu a0, a2, .LBB35_2
-; RV64-NEXT: # %bb.1:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; RV64-NEXT: li a1, 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: mv a2, a0
+; RV64-NEXT: bltu a0, a1, .LBB35_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a2, 16
; RV64-NEXT: .LBB35_2:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v8, 1
; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vsrl.vi v24, v8, 2
@@ -2559,69 +2833,117 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vsrl.vi v24, v8, 16
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: li a1, 32
-; RV64-NEXT: vsrl.vx v24, v8, a1
+; RV64-NEXT: li a3, 32
+; RV64-NEXT: vsrl.vx v24, v8, a3
; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vsrl.vi v24, v8, 1
-; RV64-NEXT: lui a2, 349525
-; RV64-NEXT: addiw a2, a2, 1365
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: vand.vx v24, v24, a2
-; RV64-NEXT: vsub.vv v8, v8, v24
-; RV64-NEXT: lui a3, 209715
-; RV64-NEXT: addiw a3, a3, 819
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v24, v8, a3
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a4, a1, 32
+; RV64-NEXT: add a1, a1, a4
+; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v0, v16, 2
+; RV64-NEXT: vmv8r.v v24, v16
+; RV64-NEXT: vxor.vx v0, v0, a1
+; RV64-NEXT: vadd.vv v16, v0, v0
+; RV64-NEXT: vxor.vv v16, v0, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v8, v8, v16
+; RV64-NEXT: vand.vv v16, v8, v0
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: vadd.vv v8, v24, v8
-; RV64-NEXT: vsrl.vi v24, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v24
-; RV64-NEXT: lui a4, 61681
-; RV64-NEXT: addiw a4, a4, -241
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vand.vx v8, v8, a4
-; RV64-NEXT: lui a5, 4112
-; RV64-NEXT: addiw a5, a5, 257
-; RV64-NEXT: slli a6, a5, 32
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: vmul.vx v8, v8, a5
-; RV64-NEXT: li a6, 56
-; RV64-NEXT: vsrl.vx v8, v8, a6
-; RV64-NEXT: addi a7, a0, -16
-; RV64-NEXT: sltu a0, a0, a7
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v16, v24, 3
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
+; RV64-NEXT: li a2, 56
+; RV64-NEXT: vsrl.vx v8, v8, a2
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: addi a4, a0, -16
+; RV64-NEXT: sltu a0, a0, a4
; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a7
+; RV64-NEXT: and a0, a0, a4
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v16, 1
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsrl.vi v24, v16, 2
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsrl.vi v24, v16, 4
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsrl.vi v24, v16, 8
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsrl.vi v24, v16, 16
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsrl.vx v24, v16, a1
-; RV64-NEXT: vor.vv v16, v16, v24
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a4, 24
+; RV64-NEXT: mul a0, a0, a4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vor.vv v16, v8, v16
+; RV64-NEXT: vsrl.vi v0, v16, 2
+; RV64-NEXT: vor.vv v16, v16, v0
+; RV64-NEXT: vsrl.vi v0, v16, 4
+; RV64-NEXT: vor.vv v16, v16, v0
+; RV64-NEXT: vsrl.vi v0, v16, 8
+; RV64-NEXT: vor.vv v16, v16, v0
+; RV64-NEXT: vsrl.vi v0, v16, 16
+; RV64-NEXT: vor.vv v16, v16, v0
+; RV64-NEXT: vsrl.vx v0, v16, a3
+; RV64-NEXT: vor.vv v16, v16, v0
; RV64-NEXT: vnot.v v16, v16
+; RV64-NEXT: vsll.vi v0, v24, 2
+; RV64-NEXT: vxor.vx v0, v0, a1
+; RV64-NEXT: vadd.vv v24, v0, v0
+; RV64-NEXT: vxor.vv v8, v0, v24
; RV64-NEXT: vsrl.vi v24, v16, 1
-; RV64-NEXT: vand.vx v24, v24, a2
-; RV64-NEXT: vsub.vv v16, v16, v24
-; RV64-NEXT: vand.vx v24, v16, a3
-; RV64-NEXT: vsrl.vi v16, v16, 2
-; RV64-NEXT: vand.vx v16, v16, a3
-; RV64-NEXT: vadd.vv v16, v24, v16
-; RV64-NEXT: vsrl.vi v24, v16, 4
-; RV64-NEXT: vadd.vv v16, v16, v24
-; RV64-NEXT: vand.vx v16, v16, a4
-; RV64-NEXT: vmul.vx v16, v16, a5
-; RV64-NEXT: vsrl.vx v16, v16, a6
+; RV64-NEXT: vand.vv v8, v24, v8
+; RV64-NEXT: vsub.vv v8, v16, v8
+; RV64-NEXT: vand.vv v16, v8, v0
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
+; RV64-NEXT: vsrl.vx v16, v8, a2
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <32 x i1> poison, i1 true, i32 0
%m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
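
The 32-element cases above are split into two 16-element halves, and the element count for the second half is computed branchlessly from the EVL (the addi/sltu/addi/and sequence on a0/a4 in the checks). A minimal RV64 sketch of that idiom, not taken from the patch and with an illustrative function name:

  #include <cstdint>

  uint64_t tail_evl(uint64_t evl) {
    uint64_t t      = evl - 16;    // addi a4, a0, -16   (wraps when evl < 16)
    uint64_t borrow = evl < t;     // sltu a0, a0, a4    (1 iff the subtraction wrapped)
    uint64_t mask   = borrow - 1;  // addi a0, a0, -1    (0 or all-ones)
    return mask & t;               // and  a0, a0, a4    == evl >= 16 ? evl - 16 : 0
  }

If evl is below 16 the subtraction wraps, sltu produces 1, and the mask zeroes the result, so the second vsetvli sees an element count of zero.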
@@ -3512,35 +3834,26 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
; RV32-NEXT: vor.vv v8, v8, v9, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10, v0.t
+; RV32-NEXT: vsll.vi v11, v10, 2, v0.t
+; RV32-NEXT: vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT: vsll.vi v12, v11, 1, v0.t
+; RV32-NEXT: vxor.vv v12, v11, v12, v0.t
+; RV32-NEXT: vand.vv v9, v9, v12, v0.t
; RV32-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9, v0.t
+; RV32-NEXT: vand.vv v9, v8, v11, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v11, v0.t
+; RV32-NEXT: vadd.vv v8, v9, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v10, v0.t
+; RV32-NEXT: vsrl.vi v9, v10, 3, v0.t
+; RV32-NEXT: vand.vv v9, v10, v9, v0.t
; RV32-NEXT: vmul.vv v8, v8, v9, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -3559,37 +3872,34 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
; RV64-NEXT: vor.vv v8, v8, v9, v0.t
; RV64-NEXT: vsrl.vi v9, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v9, v0.t
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v9, v8, a0, v0.t
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v9, v8, a1, v0.t
; RV64-NEXT: vor.vv v8, v8, v9, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0, v0.t
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV64-NEXT: vsll.vi v11, v10, 2, v0.t
+; RV64-NEXT: vxor.vx v11, v11, a1, v0.t
+; RV64-NEXT: vsll.vi v12, v11, 1, v0.t
+; RV64-NEXT: vxor.vv v12, v11, v12, v0.t
+; RV64-NEXT: vand.vv v9, v9, v12, v0.t
; RV64-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0, v0.t
+; RV64-NEXT: vand.vv v9, v8, v11, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v11, v0.t
; RV64-NEXT: vadd.vv v8, v9, v8, v0.t
; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v9, v10, 3, v0.t
+; RV64-NEXT: vand.vx v9, v9, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v9, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -3616,35 +3926,26 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
; RV32-NEXT: vor.vv v8, v8, v9
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10
+; RV32-NEXT: vsll.vi v11, v10, 2
+; RV32-NEXT: vxor.vv v11, v10, v11
+; RV32-NEXT: vadd.vv v12, v11, v11
+; RV32-NEXT: vxor.vv v12, v11, v12
+; RV32-NEXT: vand.vv v9, v9, v12
; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9
+; RV32-NEXT: vand.vv v9, v8, v11
; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vadd.vv v8, v10, v8
+; RV32-NEXT: vand.vv v8, v8, v11
+; RV32-NEXT: vadd.vv v8, v9, v8
; RV32-NEXT: vsrl.vi v9, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: vsrl.vi v9, v10, 3
+; RV32-NEXT: vand.vv v9, v10, v9
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -3663,37 +3964,34 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
; RV64-NEXT: vor.vv v8, v8, v9
; RV64-NEXT: vsrl.vi v9, v8, 16
; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v9, v8, a0
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v9, v8, a1
; RV64-NEXT: vor.vv v8, v8, v9
; RV64-NEXT: vnot.v v8, v8
; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV64-NEXT: vsll.vi v11, v10, 2
+; RV64-NEXT: vxor.vx v11, v11, a1
+; RV64-NEXT: vadd.vv v12, v11, v11
+; RV64-NEXT: vxor.vv v12, v11, v12
+; RV64-NEXT: vand.vv v9, v9, v12
; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0
+; RV64-NEXT: vand.vv v9, v8, v11
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v11
; RV64-NEXT: vadd.vv v8, v9, v8
; RV64-NEXT: vsrl.vi v9, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v9, v10, 3
+; RV64-NEXT: vand.vx v9, v9, a1
+; RV64-NEXT: vmul.vv v8, v8, v9
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -3722,35 +4020,26 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
; RV32-NEXT: vor.vv v8, v8, v10, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12, v0.t
+; RV32-NEXT: vsll.vi v14, v12, 2, v0.t
+; RV32-NEXT: vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT: vsll.vi v16, v14, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v14, v16, v0.t
+; RV32-NEXT: vand.vv v10, v10, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10, v0.t
+; RV32-NEXT: vand.vv v10, v8, v14, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v14, v0.t
+; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: vsrl.vi v10, v12, 3, v0.t
+; RV32-NEXT: vand.vv v10, v12, v10, v0.t
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -3769,37 +4058,34 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
; RV64-NEXT: vor.vv v8, v8, v10, v0.t
; RV64-NEXT: vsrl.vi v10, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v10, v0.t
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v10, v8, a0, v0.t
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v10, v8, a1, v0.t
; RV64-NEXT: vor.vv v8, v8, v10, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0, v0.t
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV64-NEXT: vsll.vi v14, v12, 2, v0.t
+; RV64-NEXT: vxor.vx v14, v14, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v14, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v14, v16, v0.t
+; RV64-NEXT: vand.vv v10, v10, v16, v0.t
; RV64-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0, v0.t
+; RV64-NEXT: vand.vv v10, v8, v14, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v14, v0.t
; RV64-NEXT: vadd.vv v8, v10, v8, v0.t
; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v10, v12, 3, v0.t
+; RV64-NEXT: vand.vx v10, v10, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v10, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -3826,35 +4112,26 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12
+; RV32-NEXT: vsll.vi v14, v12, 2
+; RV32-NEXT: vxor.vv v14, v12, v14
+; RV32-NEXT: vadd.vv v16, v14, v14
+; RV32-NEXT: vxor.vv v16, v14, v16
+; RV32-NEXT: vand.vv v10, v10, v16
; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10
+; RV32-NEXT: vand.vv v10, v8, v14
; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v12, v8
+; RV32-NEXT: vand.vv v8, v8, v14
+; RV32-NEXT: vadd.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v10, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v10
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: vsrl.vi v10, v12, 3
+; RV32-NEXT: vand.vv v10, v12, v10
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -3873,37 +4150,34 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vsrl.vi v10, v8, 16
; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v10, v8, a0
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v10, v8, a1
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vnot.v v8, v8
; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV64-NEXT: vsll.vi v14, v12, 2
+; RV64-NEXT: vxor.vx v14, v14, a1
+; RV64-NEXT: vadd.vv v16, v14, v14
+; RV64-NEXT: vxor.vv v16, v14, v16
+; RV64-NEXT: vand.vv v10, v10, v16
; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0
+; RV64-NEXT: vand.vv v10, v8, v14
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v14
; RV64-NEXT: vadd.vv v8, v10, v8
; RV64-NEXT: vsrl.vi v10, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v10, v12, 3
+; RV64-NEXT: vand.vx v10, v10, a1
+; RV64-NEXT: vmul.vv v8, v8, v10
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -3930,38 +4204,29 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v12, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT: vnot.v v12, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v12, 1, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
+; RV32-NEXT: vsll.vi v20, v8, 2, v0.t
+; RV32-NEXT: vxor.vv v20, v8, v20, v0.t
+; RV32-NEXT: vsll.vi v24, v20, 1, v0.t
+; RV32-NEXT: vxor.vv v24, v20, v24, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsub.vv v12, v12, v16, v0.t
+; RV32-NEXT: vand.vv v16, v12, v20, v0.t
+; RV32-NEXT: vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT: vand.vv v12, v12, v20, v0.t
+; RV32-NEXT: vadd.vv v12, v16, v12, v0.t
+; RV32-NEXT: vsrl.vi v16, v12, 4, v0.t
+; RV32-NEXT: vadd.vv v12, v12, v16, v0.t
+; RV32-NEXT: vand.vv v12, v12, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vmul.vv v8, v12, v8, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV32-NEXT: ret
@@ -3979,37 +4244,34 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
; RV64-NEXT: vor.vv v8, v8, v12, v0.t
; RV64-NEXT: vsrl.vi v12, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v12, v0.t
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v12, v8, a0, v0.t
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v12, v8, a1, v0.t
; RV64-NEXT: vor.vv v8, v8, v12, v0.t
-; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vnot.v v12, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v12, 1, v0.t
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV64-NEXT: vsll.vi v20, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v20, v20, a1, v0.t
+; RV64-NEXT: vsll.vi v24, v20, 1, v0.t
+; RV64-NEXT: vxor.vv v24, v20, v24, v0.t
+; RV64-NEXT: vand.vv v16, v16, v24, v0.t
+; RV64-NEXT: vsub.vv v12, v12, v16, v0.t
+; RV64-NEXT: vand.vv v16, v12, v20, v0.t
+; RV64-NEXT: vsrl.vi v12, v12, 2, v0.t
+; RV64-NEXT: vand.vv v12, v12, v20, v0.t
+; RV64-NEXT: vadd.vv v12, v16, v12, v0.t
+; RV64-NEXT: vsrl.vi v16, v12, 4, v0.t
+; RV64-NEXT: vadd.vv v12, v12, v16, v0.t
+; RV64-NEXT: vand.vx v12, v12, a1, v0.t
+; RV64-NEXT: vsrl.vi v8, v8, 3, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v12, v8, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -4036,35 +4298,26 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16
+; RV32-NEXT: vsll.vi v20, v16, 2
+; RV32-NEXT: vxor.vv v20, v16, v20
+; RV32-NEXT: vadd.vv v24, v20, v20
+; RV32-NEXT: vxor.vv v24, v20, v24
+; RV32-NEXT: vand.vv v12, v12, v24
; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12
+; RV32-NEXT: vand.vv v12, v8, v20
; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vand.vv v8, v8, v20
+; RV32-NEXT: vadd.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v12, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v12
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v12, v16, 3
+; RV32-NEXT: vand.vv v12, v16, v12
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -4083,37 +4336,34 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
; RV64-NEXT: vor.vv v8, v8, v12
; RV64-NEXT: vsrl.vi v12, v8, 16
; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v12, v8, a0
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v12, v8, a1
; RV64-NEXT: vor.vv v8, v8, v12
; RV64-NEXT: vnot.v v8, v8
; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV64-NEXT: vsll.vi v20, v16, 2
+; RV64-NEXT: vxor.vx v20, v20, a1
+; RV64-NEXT: vadd.vv v24, v20, v20
+; RV64-NEXT: vxor.vv v24, v20, v24
+; RV64-NEXT: vand.vv v12, v12, v24
; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0
+; RV64-NEXT: vand.vv v12, v8, v20
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v20
; RV64-NEXT: vadd.vv v8, v12, v8
; RV64-NEXT: vsrl.vi v12, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v12, v16, 3
+; RV64-NEXT: vand.vx v12, v12, a1
+; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -4126,24 +4376,13 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -4159,40 +4398,80 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v24, v8, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v24, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v24, v16, v0.t
+; RV32-NEXT: vand.vv v24, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v16, 2, v0.t
+; RV32-NEXT: vand.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_v15i64:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
@@ -4204,39 +4483,65 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v24, v16, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v16, v24, v16, v0.t
+; RV64-NEXT: vand.vv v24, v16, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: vadd.vv v8, v24, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl)
ret <15 x i64> %v
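
For reference, the lone constant itself is still built the usual scalar way on RV64 before being splatted: lui 61681 / addiw -241 forms the 32-bit pattern 0x0f0f0f0f, and slli 32 / add doubles it into the full 64-bit mask, while the RV32 checks instead splat the 32-bit pattern at e32 and reuse the register group at e64. A small compile-time check of the RV64 arithmetic (illustrative only, not part of the patch):

  #include <cstdint>

  constexpr int64_t lo   = (int64_t{61681} << 12) - 241; // lui a1, 61681 ; addiw a1, a1, -241
  constexpr int64_t mask = lo + (lo << 32);               // slli a4, a1, 32 ; add a1, a1, a4

  static_assert(lo   == 0x0f0f0f0f,         "32-bit pattern (positive, so addiw's sign extension is a no-op)");
  static_assert(mask == 0x0f0f0f0f0f0f0f0f, "64-bit splat value handed to vmv.v.x");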
@@ -4245,24 +4550,12 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v15i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vor.vv v8, v8, v16
@@ -4278,40 +4571,58 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
; RV32-NEXT: vsrl.vx v16, v8, a1
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v8, v8, v16
+; RV32-NEXT: vsll.vi v0, v8, 2
+; RV32-NEXT: vxor.vv v0, v8, v0
+; RV32-NEXT: vadd.vv v24, v0, v0
+; RV32-NEXT: vxor.vv v24, v0, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v16, v24
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: vsrl.vi v16, v16, 2
+; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: vadd.vv v16, v24, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: vand.vv v16, v16, v8
+; RV32-NEXT: vsrl.vi v24, v8, 3
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vmul.vv v8, v16, v8
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_v15i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v16, v8, 1
; RV64-NEXT: vor.vv v8, v8, v16
@@ -4323,39 +4634,54 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
; RV64-NEXT: vor.vv v8, v8, v16
; RV64-NEXT: vsrl.vi v16, v8, 16
; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v16, v8, a0
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v16, v8, a1
; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vnot.v v16, v8
+; RV64-NEXT: vsrl.vi v8, v16, 1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v0, v8, 2
+; RV64-NEXT: vxor.vx v0, v0, a1
+; RV64-NEXT: vadd.vv v24, v0, v0
+; RV64-NEXT: vxor.vv v24, v0, v24
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v24, v8, v24
+; RV64-NEXT: vsub.vv v16, v16, v24
+; RV64-NEXT: vand.vv v24, v16, v0
+; RV64-NEXT: vsrl.vi v16, v16, 2
+; RV64-NEXT: vand.vv v16, v16, v0
+; RV64-NEXT: vadd.vv v16, v24, v16
+; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vadd.vv v16, v16, v24
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v8, v8, 3
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vmul.vv v8, v16, v8
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <15 x i1> poison, i1 true, i32 0
%m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer
@@ -4366,24 +4692,13 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -4399,40 +4714,80 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v24, v8, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v24, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v24, v16, v0.t
+; RV32-NEXT: vand.vv v24, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v16, 2, v0.t
+; RV32-NEXT: vand.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_v16i64:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
@@ -4444,39 +4799,65 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v24, v16, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v16, v24, v16, v0.t
+; RV64-NEXT: vand.vv v24, v16, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: vadd.vv v8, v24, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, i32 %evl)
ret <16 x i64> %v
@@ -4485,24 +4866,12 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vor.vv v8, v8, v16
@@ -4518,40 +4887,58 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
; RV32-NEXT: vsrl.vx v16, v8, a1
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v8, v8, v16
+; RV32-NEXT: vsll.vi v0, v8, 2
+; RV32-NEXT: vxor.vv v0, v8, v0
+; RV32-NEXT: vadd.vv v24, v0, v0
+; RV32-NEXT: vxor.vv v24, v0, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v16, v24
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: vsrl.vi v16, v16, 2
+; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: vadd.vv v16, v24, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: vand.vv v16, v16, v8
+; RV32-NEXT: vsrl.vi v24, v8, 3
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vmul.vv v8, v16, v8
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_v16i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v16, v8, 1
; RV64-NEXT: vor.vv v8, v8, v16
@@ -4563,39 +4950,54 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
; RV64-NEXT: vor.vv v8, v8, v16
; RV64-NEXT: vsrl.vi v16, v8, 16
; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v16, v8, a0
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsrl.vx v16, v8, a1
; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vnot.v v16, v8
+; RV64-NEXT: vsrl.vi v8, v16, 1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v0, v8, 2
+; RV64-NEXT: vxor.vx v0, v0, a1
+; RV64-NEXT: vadd.vv v24, v0, v0
+; RV64-NEXT: vxor.vv v24, v0, v24
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v24, v8, v24
+; RV64-NEXT: vsub.vv v16, v16, v24
+; RV64-NEXT: vand.vv v24, v16, v0
+; RV64-NEXT: vsrl.vi v16, v16, 2
+; RV64-NEXT: vand.vv v16, v16, v0
+; RV64-NEXT: vadd.vv v16, v24, v16
+; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vadd.vv v16, v16, v24
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v8, v8, 3
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vmul.vv v8, v16, v8
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <16 x i1> poison, i1 true, i32 0
%m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
@@ -4606,39 +5008,24 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v32i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 56
+; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: li a1, 16
; RV32-NEXT: vslidedown.vi v24, v0, 2
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: mv a2, a0
-; RV32-NEXT: bltu a0, a3, .LBB70_2
+; RV32-NEXT: bltu a0, a1, .LBB70_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a2, 16
; RV32-NEXT: .LBB70_2:
@@ -4658,113 +5045,114 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
@@ -4773,9 +5161,10 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a0, a0, a3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
@@ -4790,81 +5179,119 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: addi a0, sp, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: vsub.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 56
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_v32i64:
@@ -4872,23 +5299,25 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: li a2, 48
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT: li a2, 16
+; RV64-NEXT: li a1, 16
; RV64-NEXT: vslidedown.vi v24, v0, 2
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: bltu a0, a2, .LBB70_2
+; RV64-NEXT: mv a3, a0
+; RV64-NEXT: bltu a0, a1, .LBB70_2
; RV64-NEXT: # %bb.1:
-; RV64-NEXT: li a1, 16
+; RV64-NEXT: li a3, 16
; RV64-NEXT: .LBB70_2:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t
@@ -4899,81 +5328,238 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
-; RV64-NEXT: li a1, 32
-; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t
+; RV64-NEXT: li a2, 32
+; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a2, 349525
-; RV64-NEXT: addiw a2, a2, 1365
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: vand.vx v16, v16, a2, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a3, 209715
-; RV64-NEXT: addiw a3, a3, 819
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v16, v8, a3, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a4, a1, 32
+; RV64-NEXT: add a1, a1, a4
+; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: li a5, 40
+; RV64-NEXT: mul a4, a4, a5
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: addi a3, sp, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vxor.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 3
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 3
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v16, v16, v8, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: addi a3, sp, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v16, v8, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a4, 61681
-; RV64-NEXT: addiw a4, a4, -241
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vand.vx v8, v8, a4, v0.t
-; RV64-NEXT: lui a5, 4112
-; RV64-NEXT: addiw a5, a5, 257
-; RV64-NEXT: slli a6, a5, 32
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT: li a6, 56
-; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT: addi a7, sp, 16
-; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT: addi a7, a0, -16
-; RV64-NEXT: sltu a0, a0, a7
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 40
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT: li a3, 56
+; RV64-NEXT: vsrl.vx v8, v8, a3, v0.t
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: addi a4, a0, -16
+; RV64-NEXT: sltu a0, a0, a4
; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a7
+; RV64-NEXT: and a0, a0, a4
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vmv1r.v v0, v24
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: li a4, 24
+; RV64-NEXT: mul a0, a0, a4
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: vor.vv v16, v8, v16, v0.t
-; RV64-NEXT: vsrl.vi v8, v16, 2, v0.t
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v8, v16, 1, v0.t
; RV64-NEXT: vor.vv v8, v16, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t
+; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 8, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
-; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t
+; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV64-NEXT: vor.vv v8, v8, v16, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: vand.vx v16, v16, a2, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 40
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsll.vi v16, v16, 2, v0.t
+; RV64-NEXT: vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsll.vi v8, v16, 1, v0.t
+; RV64-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: vand.vx v16, v8, a3, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: vand.vx v8, v8, a4, v0.t
-; RV64-NEXT: vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT: vsrl.vx v16, v8, a6, v0.t
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 40
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT: vsrl.vx v16, v8, a3, v0.t
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 48
+; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
@@ -4984,160 +5570,170 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
-; RV32-NEXT: vmv8r.v v24, v16
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT: li a2, 16
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: mv a1, a0
; RV32-NEXT: bltu a0, a2, .LBB71_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB71_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v24, v8, 2
+; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v24, v8, 8
+; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v24, v8, 16
+; RV32-NEXT: vor.vv v8, v8, v24
; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsrl.vx v16, v8, a2
-; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vx v24, v8, a2
+; RV32-NEXT: vor.vv v8, v8, v24
; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a3), zero
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: vsll.vi v0, v24, 2
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: vadd.vv v16, v0, v0
+; RV32-NEXT: vxor.vv v16, v0, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: addi a3, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a3), zero
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v8, v0
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v0
; RV32-NEXT: vadd.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: addi a3, sp, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vx v8, v8, a1
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 1
-; RV32-NEXT: vor.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v24, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vsrl.vi v24, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vsrl.vx v24, v8, a2
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vsrl.vi v24, v8, 1
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16
-; RV32-NEXT: vsub.vv v8, v8, v24
-; RV32-NEXT: vand.vv v24, v8, v0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v0
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vor.vv v16, v8, v16
+; RV32-NEXT: vsrl.vi v0, v16, 2
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vsrl.vi v0, v16, 4
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vsrl.vi v0, v16, 8
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vsrl.vi v0, v16, 16
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vsrl.vx v0, v16, a2
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vnot.v v8, v16
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v0, v24, 2
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: vadd.vv v16, v0, v0
+; RV32-NEXT: vxor.vv v16, v0, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: addi a0, sp, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: vand.vv v16, v8, v0
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vsrl.vx v16, v8, a1
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_v32i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a2, 16
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: bltu a0, a2, .LBB71_2
-; RV64-NEXT: # %bb.1:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; RV64-NEXT: li a1, 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: mv a2, a0
+; RV64-NEXT: bltu a0, a1, .LBB71_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a2, 16
; RV64-NEXT: .LBB71_2:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v8, 1
; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vsrl.vi v24, v8, 2
@@ -5148,69 +5744,117 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vsrl.vi v24, v8, 16
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: li a1, 32
-; RV64-NEXT: vsrl.vx v24, v8, a1
+; RV64-NEXT: li a3, 32
+; RV64-NEXT: vsrl.vx v24, v8, a3
; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vsrl.vi v24, v8, 1
-; RV64-NEXT: lui a2, 349525
-; RV64-NEXT: addiw a2, a2, 1365
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: vand.vx v24, v24, a2
-; RV64-NEXT: vsub.vv v8, v8, v24
-; RV64-NEXT: lui a3, 209715
-; RV64-NEXT: addiw a3, a3, 819
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v24, v8, a3
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a4, a1, 32
+; RV64-NEXT: add a1, a1, a4
+; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v0, v16, 2
+; RV64-NEXT: vmv8r.v v24, v16
+; RV64-NEXT: vxor.vx v0, v0, a1
+; RV64-NEXT: vadd.vv v16, v0, v0
+; RV64-NEXT: vxor.vv v16, v0, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v8, v8, v16
+; RV64-NEXT: vand.vv v16, v8, v0
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: vadd.vv v8, v24, v8
-; RV64-NEXT: vsrl.vi v24, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v24
-; RV64-NEXT: lui a4, 61681
-; RV64-NEXT: addiw a4, a4, -241
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vand.vx v8, v8, a4
-; RV64-NEXT: lui a5, 4112
-; RV64-NEXT: addiw a5, a5, 257
-; RV64-NEXT: slli a6, a5, 32
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: vmul.vx v8, v8, a5
-; RV64-NEXT: li a6, 56
-; RV64-NEXT: vsrl.vx v8, v8, a6
-; RV64-NEXT: addi a7, a0, -16
-; RV64-NEXT: sltu a0, a0, a7
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v16, v24, 3
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
+; RV64-NEXT: li a2, 56
+; RV64-NEXT: vsrl.vx v8, v8, a2
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: addi a4, a0, -16
+; RV64-NEXT: sltu a0, a0, a4
; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a7
+; RV64-NEXT: and a0, a0, a4
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v16, 1
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsrl.vi v24, v16, 2
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsrl.vi v24, v16, 4
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsrl.vi v24, v16, 8
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsrl.vi v24, v16, 16
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsrl.vx v24, v16, a1
-; RV64-NEXT: vor.vv v16, v16, v24
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a4, 24
+; RV64-NEXT: mul a0, a0, a4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vor.vv v16, v8, v16
+; RV64-NEXT: vsrl.vi v0, v16, 2
+; RV64-NEXT: vor.vv v16, v16, v0
+; RV64-NEXT: vsrl.vi v0, v16, 4
+; RV64-NEXT: vor.vv v16, v16, v0
+; RV64-NEXT: vsrl.vi v0, v16, 8
+; RV64-NEXT: vor.vv v16, v16, v0
+; RV64-NEXT: vsrl.vi v0, v16, 16
+; RV64-NEXT: vor.vv v16, v16, v0
+; RV64-NEXT: vsrl.vx v0, v16, a3
+; RV64-NEXT: vor.vv v16, v16, v0
; RV64-NEXT: vnot.v v16, v16
+; RV64-NEXT: vsll.vi v0, v24, 2
+; RV64-NEXT: vxor.vx v0, v0, a1
+; RV64-NEXT: vadd.vv v24, v0, v0
+; RV64-NEXT: vxor.vv v8, v0, v24
; RV64-NEXT: vsrl.vi v24, v16, 1
-; RV64-NEXT: vand.vx v24, v24, a2
-; RV64-NEXT: vsub.vv v16, v16, v24
-; RV64-NEXT: vand.vx v24, v16, a3
-; RV64-NEXT: vsrl.vi v16, v16, 2
-; RV64-NEXT: vand.vx v16, v16, a3
-; RV64-NEXT: vadd.vv v16, v24, v16
-; RV64-NEXT: vsrl.vi v24, v16, 4
-; RV64-NEXT: vadd.vv v16, v16, v24
-; RV64-NEXT: vand.vx v16, v16, a4
-; RV64-NEXT: vmul.vx v16, v16, a5
-; RV64-NEXT: vsrl.vx v16, v16, a6
+; RV64-NEXT: vand.vv v8, v24, v8
+; RV64-NEXT: vsub.vv v8, v16, v8
+; RV64-NEXT: vand.vv v16, v8, v0
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
+; RV64-NEXT: vsrl.vx v16, v8, a2
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <32 x i1> poison, i1 true, i32 0
%m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
index 277146cc1403e9..33ac13ffc2cac8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
@@ -262,35 +262,26 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
; RV32I-NEXT: vor.vv v8, v8, v9
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v9, v8, 1
-; RV32I-NEXT: lui a1, 349525
-; RV32I-NEXT: addi a1, a1, 1365
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT: lui a1, 61681
+; RV32I-NEXT: addi a1, a1, -241
+; RV32I-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32I-NEXT: vmv.v.x v10, a1
; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v9, v9, v10
+; RV32I-NEXT: vsll.vi v11, v10, 2
+; RV32I-NEXT: vxor.vv v11, v10, v11
+; RV32I-NEXT: vadd.vv v12, v11, v11
+; RV32I-NEXT: vxor.vv v12, v11, v12
+; RV32I-NEXT: vand.vv v9, v9, v12
; RV32I-NEXT: vsub.vv v8, v8, v9
-; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi a1, a1, 819
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a1
-; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v10, v8, v9
+; RV32I-NEXT: vand.vv v9, v8, v11
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vand.vv v8, v8, v11
+; RV32I-NEXT: vadd.vv v8, v9, v8
; RV32I-NEXT: vsrl.vi v9, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v9
-; RV32I-NEXT: lui a1, 61681
-; RV32I-NEXT: addi a1, a1, -241
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a1
-; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi a1, a1, 257
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a1
-; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v9, v10, 3
+; RV32I-NEXT: vand.vv v9, v10, v9
; RV32I-NEXT: vmul.vv v8, v8, v9
; RV32I-NEXT: li a1, 56
; RV32I-NEXT: vsrl.vx v8, v8, a1
@@ -316,32 +307,29 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
; RV64I-NEXT: vor.vv v8, v8, v9
; RV64I-NEXT: vnot.v v8, v8
; RV64I-NEXT: vsrl.vi v9, v8, 1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw a1, a1, 1365
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v9, v9, a1
+; RV64I-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64I-NEXT: vmv.v.x v10, a1
+; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64I-NEXT: vsll.vi v11, v10, 2
+; RV64I-NEXT: vxor.vx v11, v11, a1
+; RV64I-NEXT: vadd.vv v12, v11, v11
+; RV64I-NEXT: vxor.vv v12, v11, v12
+; RV64I-NEXT: vand.vv v9, v9, v12
; RV64I-NEXT: vsub.vv v8, v8, v9
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v9, v8, a1
+; RV64I-NEXT: vand.vv v9, v8, v11
; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vand.vv v8, v8, v11
; RV64I-NEXT: vadd.vv v8, v9, v8
; RV64I-NEXT: vsrl.vi v9, v8, 4
; RV64I-NEXT: vadd.vv v8, v8, v9
-; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addiw a1, a1, -241
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: vand.vx v8, v8, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: vsrl.vi v9, v10, 3
+; RV64I-NEXT: vand.vx v9, v9, a1
+; RV64I-NEXT: vmul.vv v8, v8, v9
; RV64I-NEXT: li a1, 56
; RV64I-NEXT: vsrl.vx v8, v8, a1
; RV64I-NEXT: vse64.v v8, (a0)
@@ -671,35 +659,26 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
; RV32I-NEXT: vor.vv v8, v8, v10
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v10, v8, 1
-; RV32I-NEXT: lui a1, 349525
-; RV32I-NEXT: addi a1, a1, 1365
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT: lui a1, 61681
+; RV32I-NEXT: addi a1, a1, -241
+; RV32I-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32I-NEXT: vmv.v.x v12, a1
; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v10, v10, v12
+; RV32I-NEXT: vsll.vi v14, v12, 2
+; RV32I-NEXT: vxor.vv v14, v12, v14
+; RV32I-NEXT: vadd.vv v16, v14, v14
+; RV32I-NEXT: vxor.vv v16, v14, v16
+; RV32I-NEXT: vand.vv v10, v10, v16
; RV32I-NEXT: vsub.vv v8, v8, v10
-; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi a1, a1, 819
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a1
-; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v12, v8, v10
+; RV32I-NEXT: vand.vv v10, v8, v14
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vand.vv v8, v8, v14
+; RV32I-NEXT: vadd.vv v8, v10, v8
; RV32I-NEXT: vsrl.vi v10, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v10
-; RV32I-NEXT: lui a1, 61681
-; RV32I-NEXT: addi a1, a1, -241
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a1
-; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi a1, a1, 257
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a1
-; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v10, v12, 3
+; RV32I-NEXT: vand.vv v10, v12, v10
; RV32I-NEXT: vmul.vv v8, v8, v10
; RV32I-NEXT: li a1, 56
; RV32I-NEXT: vsrl.vx v8, v8, a1
@@ -725,32 +704,29 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
; RV64I-NEXT: vor.vv v8, v8, v10
; RV64I-NEXT: vnot.v v8, v8
; RV64I-NEXT: vsrl.vi v10, v8, 1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw a1, a1, 1365
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v10, v10, a1
+; RV64I-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64I-NEXT: vmv.v.x v12, a1
+; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64I-NEXT: vsll.vi v14, v12, 2
+; RV64I-NEXT: vxor.vx v14, v14, a1
+; RV64I-NEXT: vadd.vv v16, v14, v14
+; RV64I-NEXT: vxor.vv v16, v14, v16
+; RV64I-NEXT: vand.vv v10, v10, v16
; RV64I-NEXT: vsub.vv v8, v8, v10
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v10, v8, a1
+; RV64I-NEXT: vand.vv v10, v8, v14
; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vand.vv v8, v8, v14
; RV64I-NEXT: vadd.vv v8, v10, v8
; RV64I-NEXT: vsrl.vi v10, v8, 4
; RV64I-NEXT: vadd.vv v8, v8, v10
-; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addiw a1, a1, -241
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: vand.vx v8, v8, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: vsrl.vi v10, v12, 3
+; RV64I-NEXT: vand.vx v10, v10, a1
+; RV64I-NEXT: vmul.vv v8, v8, v10
; RV64I-NEXT: li a1, 56
; RV64I-NEXT: vsrl.vx v8, v8, a1
; RV64I-NEXT: vse64.v v8, (a0)
@@ -1061,35 +1037,26 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
; RV32I-NEXT: vor.vv v8, v8, v9
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v9, v8, 1
-; RV32I-NEXT: lui a1, 349525
-; RV32I-NEXT: addi a1, a1, 1365
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT: lui a1, 61681
+; RV32I-NEXT: addi a1, a1, -241
+; RV32I-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32I-NEXT: vmv.v.x v10, a1
; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v9, v9, v10
+; RV32I-NEXT: vsll.vi v11, v10, 2
+; RV32I-NEXT: vxor.vv v11, v10, v11
+; RV32I-NEXT: vadd.vv v12, v11, v11
+; RV32I-NEXT: vxor.vv v12, v11, v12
+; RV32I-NEXT: vand.vv v9, v9, v12
; RV32I-NEXT: vsub.vv v8, v8, v9
-; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi a1, a1, 819
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a1
-; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v10, v8, v9
+; RV32I-NEXT: vand.vv v9, v8, v11
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vand.vv v8, v8, v11
+; RV32I-NEXT: vadd.vv v8, v9, v8
; RV32I-NEXT: vsrl.vi v9, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v9
-; RV32I-NEXT: lui a1, 61681
-; RV32I-NEXT: addi a1, a1, -241
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a1
-; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi a1, a1, 257
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a1
-; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v9, v10, 3
+; RV32I-NEXT: vand.vv v9, v10, v9
; RV32I-NEXT: vmul.vv v8, v8, v9
; RV32I-NEXT: li a1, 56
; RV32I-NEXT: vsrl.vx v8, v8, a1
@@ -1115,32 +1082,29 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
; RV64I-NEXT: vor.vv v8, v8, v9
; RV64I-NEXT: vnot.v v8, v8
; RV64I-NEXT: vsrl.vi v9, v8, 1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw a1, a1, 1365
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v9, v9, a1
+; RV64I-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64I-NEXT: vmv.v.x v10, a1
+; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64I-NEXT: vsll.vi v11, v10, 2
+; RV64I-NEXT: vxor.vx v11, v11, a1
+; RV64I-NEXT: vadd.vv v12, v11, v11
+; RV64I-NEXT: vxor.vv v12, v11, v12
+; RV64I-NEXT: vand.vv v9, v9, v12
; RV64I-NEXT: vsub.vv v8, v8, v9
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v9, v8, a1
+; RV64I-NEXT: vand.vv v9, v8, v11
; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vand.vv v8, v8, v11
; RV64I-NEXT: vadd.vv v8, v9, v8
; RV64I-NEXT: vsrl.vi v9, v8, 4
; RV64I-NEXT: vadd.vv v8, v8, v9
-; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addiw a1, a1, -241
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: vand.vx v8, v8, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: vsrl.vi v9, v10, 3
+; RV64I-NEXT: vand.vx v9, v9, a1
+; RV64I-NEXT: vmul.vv v8, v8, v9
; RV64I-NEXT: li a1, 56
; RV64I-NEXT: vsrl.vx v8, v8, a1
; RV64I-NEXT: vse64.v v8, (a0)
@@ -1446,35 +1410,26 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
; RV32I-NEXT: vor.vv v8, v8, v10
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v10, v8, 1
-; RV32I-NEXT: lui a1, 349525
-; RV32I-NEXT: addi a1, a1, 1365
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT: lui a1, 61681
+; RV32I-NEXT: addi a1, a1, -241
+; RV32I-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32I-NEXT: vmv.v.x v12, a1
; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v10, v10, v12
+; RV32I-NEXT: vsll.vi v14, v12, 2
+; RV32I-NEXT: vxor.vv v14, v12, v14
+; RV32I-NEXT: vadd.vv v16, v14, v14
+; RV32I-NEXT: vxor.vv v16, v14, v16
+; RV32I-NEXT: vand.vv v10, v10, v16
; RV32I-NEXT: vsub.vv v8, v8, v10
-; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi a1, a1, 819
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a1
-; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v12, v8, v10
+; RV32I-NEXT: vand.vv v10, v8, v14
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vand.vv v8, v8, v14
+; RV32I-NEXT: vadd.vv v8, v10, v8
; RV32I-NEXT: vsrl.vi v10, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v10
-; RV32I-NEXT: lui a1, 61681
-; RV32I-NEXT: addi a1, a1, -241
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a1
-; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi a1, a1, 257
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a1
-; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v10, v12, 3
+; RV32I-NEXT: vand.vv v10, v12, v10
; RV32I-NEXT: vmul.vv v8, v8, v10
; RV32I-NEXT: li a1, 56
; RV32I-NEXT: vsrl.vx v8, v8, a1
@@ -1500,32 +1455,29 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
; RV64I-NEXT: vor.vv v8, v8, v10
; RV64I-NEXT: vnot.v v8, v8
; RV64I-NEXT: vsrl.vi v10, v8, 1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw a1, a1, 1365
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v10, v10, a1
+; RV64I-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64I-NEXT: vmv.v.x v12, a1
+; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64I-NEXT: vsll.vi v14, v12, 2
+; RV64I-NEXT: vxor.vx v14, v14, a1
+; RV64I-NEXT: vadd.vv v16, v14, v14
+; RV64I-NEXT: vxor.vv v16, v14, v16
+; RV64I-NEXT: vand.vv v10, v10, v16
; RV64I-NEXT: vsub.vv v8, v8, v10
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v10, v8, a1
+; RV64I-NEXT: vand.vv v10, v8, v14
; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vand.vv v8, v8, v14
; RV64I-NEXT: vadd.vv v8, v10, v8
; RV64I-NEXT: vsrl.vi v10, v8, 4
; RV64I-NEXT: vadd.vv v8, v8, v10
-; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addiw a1, a1, -241
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: vand.vx v8, v8, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: vsrl.vi v10, v12, 3
+; RV64I-NEXT: vand.vx v10, v10, a1
+; RV64I-NEXT: vmul.vv v8, v8, v10
; RV64I-NEXT: li a1, 56
; RV64I-NEXT: vsrl.vx v8, v8, a1
; RV64I-NEXT: vse64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index c4b22955f84c4f..7e1e94cbed3755 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -681,37 +681,27 @@ declare <2 x i64> @llvm.vp.ctpop.v2i64(<2 x i64>, <2 x i1>, i32)
define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v2i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v9, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v9, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsll.vi v10, v9, 2, v0.t
+; RV32-NEXT: vxor.vv v10, v9, v10, v0.t
+; RV32-NEXT: vsll.vi v11, v10, 1, v0.t
+; RV32-NEXT: vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT: vand.vv v11, v12, v11, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v11, v0.t
+; RV32-NEXT: vand.vv v11, v8, v10, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v10, v0.t
+; RV32-NEXT: vadd.vv v8, v11, v8, v0.t
+; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vi v10, v9, 3, v0.t
+; RV32-NEXT: vand.vv v9, v9, v10, v0.t
; RV32-NEXT: vmul.vv v8, v8, v9, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -719,34 +709,30 @@ define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
;
; RV64-LABEL: vp_ctpop_v2i64:
; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a1
; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0, v0.t
+; RV64-NEXT: vsll.vi v10, v9, 2, v0.t
+; RV64-NEXT: vxor.vx v10, v10, a1, v0.t
+; RV64-NEXT: vsll.vi v11, v10, 1, v0.t
+; RV64-NEXT: vxor.vv v11, v10, v11, v0.t
+; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT: vand.vv v11, v12, v11, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v11, v0.t
+; RV64-NEXT: vand.vv v11, v8, v10, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v10, v0.t
+; RV64-NEXT: vadd.vv v8, v11, v8, v0.t
+; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v10, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v9, v9, 3, v0.t
+; RV64-NEXT: vand.vx v9, v9, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v9, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -757,37 +743,27 @@ define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v2i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v9, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsll.vi v10, v9, 2
+; RV32-NEXT: vxor.vv v10, v9, v10
+; RV32-NEXT: vadd.vv v11, v10, v10
+; RV32-NEXT: vxor.vv v11, v10, v11
+; RV32-NEXT: vsrl.vi v12, v8, 1
+; RV32-NEXT: vand.vv v11, v12, v11
+; RV32-NEXT: vsub.vv v8, v8, v11
+; RV32-NEXT: vand.vv v11, v8, v10
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: vadd.vv v8, v11, v8
+; RV32-NEXT: vsrl.vi v10, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vi v10, v9, 3
+; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -795,34 +771,30 @@ define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
;
; RV64-LABEL: vp_ctpop_v2i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a1
; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0
+; RV64-NEXT: vsll.vi v10, v9, 2
+; RV64-NEXT: vxor.vx v10, v10, a1
+; RV64-NEXT: vadd.vv v11, v10, v10
+; RV64-NEXT: vxor.vv v11, v10, v11
+; RV64-NEXT: vsrl.vi v12, v8, 1
+; RV64-NEXT: vand.vv v11, v12, v11
+; RV64-NEXT: vsub.vv v8, v8, v11
+; RV64-NEXT: vand.vv v11, v8, v10
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v10
+; RV64-NEXT: vadd.vv v8, v11, v8
+; RV64-NEXT: vsrl.vi v10, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v10
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v9, v9, 3
+; RV64-NEXT: vand.vx v9, v9, a1
+; RV64-NEXT: vmul.vv v8, v8, v9
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -837,37 +809,27 @@ declare <4 x i64> @llvm.vp.ctpop.v4i64(<4 x i64>, <4 x i1>, i32)
define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v4i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsll.vi v12, v10, 2, v0.t
+; RV32-NEXT: vxor.vv v12, v10, v12, v0.t
+; RV32-NEXT: vsll.vi v14, v12, 1, v0.t
+; RV32-NEXT: vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: vand.vv v14, v16, v14, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v14, v0.t
+; RV32-NEXT: vand.vv v14, v8, v12, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: vadd.vv v8, v14, v8, v0.t
+; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v12, v10, 3, v0.t
+; RV32-NEXT: vand.vv v10, v10, v12, v0.t
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -875,34 +837,30 @@ define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
;
; RV64-LABEL: vp_ctpop_v4i64:
; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0, v0.t
+; RV64-NEXT: vsll.vi v12, v10, 2, v0.t
+; RV64-NEXT: vxor.vx v12, v12, a1, v0.t
+; RV64-NEXT: vsll.vi v14, v12, 1, v0.t
+; RV64-NEXT: vxor.vv v14, v12, v14, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT: vand.vv v14, v16, v14, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v14, v0.t
+; RV64-NEXT: vand.vv v14, v8, v12, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v12, v0.t
+; RV64-NEXT: vadd.vv v8, v14, v8, v0.t
+; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v12, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v10, v10, 3, v0.t
+; RV64-NEXT: vand.vx v10, v10, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v10, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -913,37 +871,27 @@ define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v4i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsll.vi v12, v10, 2
+; RV32-NEXT: vxor.vv v12, v10, v12
+; RV32-NEXT: vadd.vv v14, v12, v12
+; RV32-NEXT: vxor.vv v14, v12, v14
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v14, v16, v14
+; RV32-NEXT: vsub.vv v8, v8, v14
+; RV32-NEXT: vand.vv v14, v8, v12
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: vadd.vv v8, v14, v8
+; RV32-NEXT: vsrl.vi v12, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v12
; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v12, v10, 3
+; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -951,34 +899,30 @@ define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
;
; RV64-LABEL: vp_ctpop_v4i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0
+; RV64-NEXT: vsll.vi v12, v10, 2
+; RV64-NEXT: vxor.vx v12, v12, a1
+; RV64-NEXT: vadd.vv v14, v12, v12
+; RV64-NEXT: vxor.vv v14, v12, v14
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vand.vv v14, v16, v14
+; RV64-NEXT: vsub.vv v8, v8, v14
+; RV64-NEXT: vand.vv v14, v8, v12
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v12
+; RV64-NEXT: vadd.vv v8, v14, v8
+; RV64-NEXT: vsrl.vi v12, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v12
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v10, v10, 3
+; RV64-NEXT: vand.vx v10, v10, a1
+; RV64-NEXT: vmul.vv v8, v8, v10
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -993,37 +937,27 @@ declare <8 x i64> @llvm.vp.ctpop.v8i64(<8 x i64>, <8 x i1>, i32)
define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsll.vi v16, v12, 2, v0.t
+; RV32-NEXT: vxor.vv v16, v12, v16, v0.t
+; RV32-NEXT: vsll.vi v20, v16, 1, v0.t
+; RV32-NEXT: vxor.vv v20, v16, v20, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: vand.vv v20, v24, v20, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v20, v0.t
+; RV32-NEXT: vand.vv v20, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vadd.vv v8, v20, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vi v16, v12, 3, v0.t
+; RV32-NEXT: vand.vv v12, v12, v16, v0.t
; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -1031,34 +965,30 @@ define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
;
; RV64-LABEL: vp_ctpop_v8i64:
; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0, v0.t
+; RV64-NEXT: vsll.vi v16, v12, 2, v0.t
+; RV64-NEXT: vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT: vsll.vi v20, v16, 1, v0.t
+; RV64-NEXT: vxor.vv v20, v16, v20, v0.t
+; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT: vand.vv v20, v24, v20, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v20, v0.t
+; RV64-NEXT: vand.vv v20, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: vadd.vv v8, v20, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v12, v12, 3, v0.t
+; RV64-NEXT: vand.vx v12, v12, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v12, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -1069,37 +999,27 @@ define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v8i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsll.vi v16, v12, 2
+; RV32-NEXT: vxor.vv v16, v12, v16
+; RV32-NEXT: vadd.vv v20, v16, v16
+; RV32-NEXT: vxor.vv v20, v16, v20
+; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vand.vv v20, v24, v20
+; RV32-NEXT: vsub.vv v8, v8, v20
+; RV32-NEXT: vand.vv v20, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vadd.vv v8, v20, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vi v16, v12, 3
+; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -1107,34 +1027,30 @@ define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
;
; RV64-LABEL: vp_ctpop_v8i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0
+; RV64-NEXT: vsll.vi v16, v12, 2
+; RV64-NEXT: vxor.vx v16, v16, a1
+; RV64-NEXT: vadd.vv v20, v16, v16
+; RV64-NEXT: vxor.vv v20, v16, v20
+; RV64-NEXT: vsrl.vi v24, v8, 1
+; RV64-NEXT: vand.vv v20, v24, v20
+; RV64-NEXT: vsub.vv v8, v8, v20
+; RV64-NEXT: vand.vv v20, v8, v16
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vadd.vv v8, v20, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v12, v12, 3
+; RV64-NEXT: vand.vx v12, v12, a1
+; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -1149,89 +1065,163 @@ declare <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64>, <15 x i1>, i32)
define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT: vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT: vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v15i64:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vsll.vi v24, v16, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v24, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 1, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v24, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v24, v24, v16, v0.t
+; RV64-NEXT: vand.vv v16, v24, v8, v0.t
+; RV64-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV64-NEXT: vand.vv v8, v24, v8, v0.t
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%v = call <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl)
ret <15 x i64> %v
@@ -1240,91 +1230,100 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v15i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsll.vi v24, v16, 2
+; RV32-NEXT: vxor.vv v24, v16, v24
+; RV32-NEXT: vadd.vv v0, v24, v24
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v16
+; RV32-NEXT: vsub.vv v8, v8, v0
+; RV32-NEXT: vand.vv v0, v8, v24
; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v15i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v24, v16, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v0, v24, v0
; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vand.vv v16, v16, v0
; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
+; RV64-NEXT: vand.vv v16, v8, v24
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v24
; RV64-NEXT: vadd.vv v8, v16, v8
; RV64-NEXT: vsrl.vi v16, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
- %head = insertelement <15 x i1> poison, i1 true, i32 0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %head = insertelement <15 x i1> poison, i1 true, i32 0
%m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer
%v = call <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl)
ret <15 x i64> %v
@@ -1335,89 +1334,163 @@ declare <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64>, <16 x i1>, i32)
define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT: vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT: vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v16i64:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vsll.vi v24, v16, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v24, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 1, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v24, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v24, v24, v16, v0.t
+; RV64-NEXT: vand.vv v16, v24, v8, v0.t
+; RV64-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV64-NEXT: vand.vv v8, v24, v8, v0.t
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%v = call <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64> %va, <16 x i1> %m, i32 %evl)
ret <16 x i64> %v
@@ -1426,89 +1499,98 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsll.vi v24, v16, 2
+; RV32-NEXT: vxor.vv v24, v16, v24
+; RV32-NEXT: vadd.vv v0, v24, v24
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v16
+; RV32-NEXT: vsub.vv v8, v8, v0
+; RV32-NEXT: vand.vv v0, v8, v24
; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v16i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v24, v16, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v0, v24, v0
; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vand.vv v16, v16, v0
; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
+; RV64-NEXT: vand.vv v16, v8, v24
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v24
; RV64-NEXT: vadd.vv v8, v16, v8
; RV64-NEXT: vsrl.vi v16, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <16 x i1> poison, i1 true, i32 0
%m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
@@ -1521,117 +1603,151 @@ declare <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64>, <32 x i1>, i32)
define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v32i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v7, v0, 2
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 20(sp)
; RV32-NEXT: li a2, 16
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: vslidedown.vi v7, v0, 2
; RV32-NEXT: mv a1, a0
; RV32-NEXT: bltu a0, a2, .LBB34_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB34_2:
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a2, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v8, (a2), zero
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: li a3, 24
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v24, v8, v16, v0.t
-; RV32-NEXT: addi a2, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a2), zero
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v24, v8, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
+; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t
-; RV32-NEXT: vadd.vv v16, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a2), zero
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v16, 3, v0.t
; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a2), zero
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
+; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV32-NEXT: addi a2, a0, -16
; RV32-NEXT: sltu a0, a0, a2
@@ -1639,52 +1755,102 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: and a0, a0, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v7
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v16, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 1, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a2, 24
+; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a2, 40
+; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a2, 40
+; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a2, 40
+; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a2, 40
+; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v16, 3, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a2, 40
+; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v32i64:
@@ -1692,81 +1858,263 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: li a2, 48
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: slli a1, a1, 5
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT: li a2, 16
+; RV64-NEXT: li a1, 16
; RV64-NEXT: vslidedown.vi v24, v0, 2
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: bltu a0, a2, .LBB34_2
+; RV64-NEXT: mv a2, a0
+; RV64-NEXT: bltu a0, a1, .LBB34_2
; RV64-NEXT: # %bb.1:
-; RV64-NEXT: li a1, 16
+; RV64-NEXT: li a2, 16
; RV64-NEXT: .LBB34_2:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a1, 349525
-; RV64-NEXT: addiw a1, a1, 1365
-; RV64-NEXT: slli a2, a1, 32
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vand.vx v16, v16, a1, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a2, 209715
-; RV64-NEXT: addiw a2, a2, 819
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: vand.vx v16, v8, a2, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a2, v0.t
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a3, a1, 32
+; RV64-NEXT: add a1, a1, a3
+; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 40
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v16, v16, v8, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v16, v8, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a3, 61681
-; RV64-NEXT: addiw a3, a3, -241
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
-; RV64-NEXT: lui a4, 4112
-; RV64-NEXT: addiw a4, a4, 257
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT: li a5, 56
-; RV64-NEXT: vsrl.vx v8, v8, a5, v0.t
-; RV64-NEXT: addi a6, sp, 16
-; RV64-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 40
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV64-NEXT: vand.vx v8, v16, a1, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vmul.vv v16, v16, v8, v0.t
+; RV64-NEXT: li a2, 56
+; RV64-NEXT: vsrl.vx v16, v16, a2, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 24
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: addi a3, a0, -16
+; RV64-NEXT: sltu a0, a0, a3
; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a6
+; RV64-NEXT: and a0, a0, a3
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vmv1r.v v0, v24
; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 40
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsll.vi v8, v16, 1, v0.t
+; RV64-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: vand.vx v16, v16, a1, v0.t
-; RV64-NEXT: vsub.vv v16, v8, v16, v0.t
-; RV64-NEXT: vand.vx v8, v16, a2, v0.t
-; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT: vand.vx v16, v16, a2, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
-; RV64-NEXT: vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT: vsrl.vx v16, v8, a5, v0.t
; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 40
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 48
+; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
@@ -1777,190 +2125,223 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 40
+; RV32-NEXT: li a2, 24
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 40 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: li a2, 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: li a2, 16
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: mv a1, a0
; RV32-NEXT: bltu a0, a2, .LBB35_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB35_2:
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a2, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a2), zero
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: addi a2, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a2), zero
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v0
+; RV32-NEXT: vsll.vi v0, v24, 2
+; RV32-NEXT: vxor.vv v16, v24, v0
+; RV32-NEXT: vadd.vv v0, v16, v16
+; RV32-NEXT: vxor.vv v8, v16, v0
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v0, v8
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v0, v8
+; RV32-NEXT: vand.vv v0, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v0
-; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vadd.vv v8, v0, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a2, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a2), zero
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a2), zero
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v8
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
+; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a1, 56
-; RV32-NEXT: vsrl.vx v8, v16, a1
+; RV32-NEXT: vsrl.vx v8, v8, a1
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
+; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV32-NEXT: addi a2, a0, -16
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsll.vi v16, v24, 2
+; RV32-NEXT: vxor.vv v16, v24, v16
+; RV32-NEXT: vadd.vv v0, v16, v16
+; RV32-NEXT: vxor.vv v8, v16, v0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a0, a0, a2
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v16, v8, v16
-; RV32-NEXT: vand.vv v8, v16, v0
-; RV32-NEXT: vsrl.vi v16, v16, 2
-; RV32-NEXT: vand.vv v16, v16, v0
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v0, v8
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v0, v8
+; RV32-NEXT: vand.vv v0, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: addi a0, sp, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vsrl.vx v16, v8, a1
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v32i64_unmasked:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV64-NEXT: li a2, 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV64-NEXT: mv a1, a0
; RV64-NEXT: bltu a0, a2, .LBB35_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a1, 16
; RV64-NEXT: .LBB35_2:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v8, 1
-; RV64-NEXT: lui a1, 349525
-; RV64-NEXT: addiw a1, a1, 1365
-; RV64-NEXT: slli a2, a1, 32
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vand.vx v24, v24, a1
-; RV64-NEXT: vsub.vv v8, v8, v24
-; RV64-NEXT: lui a2, 209715
-; RV64-NEXT: addiw a2, a2, 819
+; RV64-NEXT: lui a2, 61681
+; RV64-NEXT: addiw a2, a2, -241
; RV64-NEXT: slli a3, a2, 32
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: vand.vx v24, v8, a2
+; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a2
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsll.vi v0, v8, 2
+; RV64-NEXT: vxor.vx v0, v0, a2
+; RV64-NEXT: vadd.vv v24, v0, v0
+; RV64-NEXT: vxor.vv v8, v0, v24
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v24, v16, 1
+; RV64-NEXT: vand.vv v8, v24, v8
+; RV64-NEXT: vsub.vv v8, v16, v8
+; RV64-NEXT: vand.vv v24, v8, v0
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a2
+; RV64-NEXT: vand.vv v8, v8, v0
; RV64-NEXT: vadd.vv v8, v24, v8
; RV64-NEXT: vsrl.vi v24, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v24
-; RV64-NEXT: lui a3, 61681
-; RV64-NEXT: addiw a3, a3, -241
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: lui a4, 4112
-; RV64-NEXT: addiw a4, a4, 257
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vmul.vx v8, v8, a4
-; RV64-NEXT: li a5, 56
-; RV64-NEXT: vsrl.vx v8, v8, a5
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
+; RV64-NEXT: vand.vx v8, v8, a2
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v24, v16, 3
+; RV64-NEXT: vand.vx v24, v24, a2
+; RV64-NEXT: vmul.vv v8, v8, v24
+; RV64-NEXT: li a1, 56
+; RV64-NEXT: vsrl.vx v8, v8, a1
+; RV64-NEXT: addi a3, sp, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: addi a3, a0, -16
+; RV64-NEXT: sltu a0, a0, a3
; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a6
+; RV64-NEXT: and a0, a0, a3
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v16, 1
-; RV64-NEXT: vand.vx v24, v24, a1
-; RV64-NEXT: vsub.vv v16, v16, v24
-; RV64-NEXT: vand.vx v24, v16, a2
-; RV64-NEXT: vsrl.vi v16, v16, 2
+; RV64-NEXT: vsll.vi v24, v16, 2
+; RV64-NEXT: vxor.vx v24, v24, a2
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v8, v24, v0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v0, v16, 1
+; RV64-NEXT: vand.vv v8, v0, v8
+; RV64-NEXT: vsub.vv v8, v16, v8
+; RV64-NEXT: vand.vv v0, v8, v24
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: vadd.vv v8, v0, v8
+; RV64-NEXT: vsrl.vi v24, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v24
+; RV64-NEXT: vand.vx v8, v8, a2
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
; RV64-NEXT: vand.vx v16, v16, a2
-; RV64-NEXT: vadd.vv v16, v24, v16
-; RV64-NEXT: vsrl.vi v24, v16, 4
-; RV64-NEXT: vadd.vv v16, v16, v24
-; RV64-NEXT: vand.vx v16, v16, a3
-; RV64-NEXT: vmul.vx v16, v16, a4
-; RV64-NEXT: vsrl.vx v16, v16, a5
+; RV64-NEXT: vmul.vv v8, v8, v16
+; RV64-NEXT: vsrl.vx v16, v8, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <32 x i1> poison, i1 true, i32 0
%m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
index b5114bbe491896..5cdcaf226d856f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
@@ -129,36 +129,27 @@ define void @ctpop_v2i64(ptr %x, ptr %y) {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: vand.vv v9, v10, v9
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v9, a1
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vsll.vi v10, v9, 2
+; RV32-NEXT: vxor.vv v10, v9, v10
+; RV32-NEXT: vadd.vv v11, v10, v10
+; RV32-NEXT: vxor.vv v11, v10, v11
+; RV32-NEXT: vsrl.vi v12, v8, 1
+; RV32-NEXT: vand.vv v11, v12, v11
+; RV32-NEXT: vsub.vv v8, v8, v11
+; RV32-NEXT: vand.vv v11, v8, v10
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: vadd.vv v8, v11, v8
+; RV32-NEXT: vsrl.vi v10, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vi v10, v9, 3
+; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vx v8, v8, a1
@@ -169,33 +160,30 @@ define void @ctpop_v2i64(ptr %x, ptr %y) {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a1, 349525
-; RV64-NEXT: addiw a1, a1, 1365
-; RV64-NEXT: slli a2, a1, 32
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vand.vx v9, v9, a1
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a1, 209715
-; RV64-NEXT: addiw a1, a1, 819
-; RV64-NEXT: slli a2, a1, 32
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vand.vx v9, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
; RV64-NEXT: lui a1, 61681
; RV64-NEXT: addiw a1, a1, -241
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a1
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vsll.vi v10, v9, 2
+; RV64-NEXT: vxor.vx v10, v10, a1
+; RV64-NEXT: vadd.vv v11, v10, v10
+; RV64-NEXT: vxor.vv v11, v10, v11
+; RV64-NEXT: vsrl.vi v12, v8, 1
+; RV64-NEXT: vand.vv v11, v12, v11
+; RV64-NEXT: vsub.vv v8, v8, v11
+; RV64-NEXT: vand.vv v11, v8, v10
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vv v8, v8, v10
+; RV64-NEXT: vadd.vv v8, v11, v8
+; RV64-NEXT: vsrl.vi v10, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v10
; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: lui a1, 4112
-; RV64-NEXT: addiw a1, a1, 257
-; RV64-NEXT: slli a2, a1, 32
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vmul.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v9, v9, 3
+; RV64-NEXT: vand.vx v9, v9, a1
+; RV64-NEXT: vmul.vv v8, v8, v9
; RV64-NEXT: li a1, 56
; RV64-NEXT: vsrl.vx v8, v8, a1
; RV64-NEXT: vse64.v v8, (a0)
@@ -435,36 +423,27 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: vand.vv v10, v12, v10
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vsll.vi v12, v10, 2
+; RV32-NEXT: vxor.vv v12, v10, v12
+; RV32-NEXT: vadd.vv v14, v12, v12
+; RV32-NEXT: vxor.vv v14, v12, v14
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v14, v16, v14
+; RV32-NEXT: vsub.vv v8, v8, v14
+; RV32-NEXT: vand.vv v14, v8, v12
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: vadd.vv v8, v14, v8
+; RV32-NEXT: vsrl.vi v12, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v12
; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v12, v10, 3
+; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vx v8, v8, a1
@@ -475,33 +454,30 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a1, 349525
-; RV64-NEXT: addiw a1, a1, 1365
-; RV64-NEXT: slli a2, a1, 32
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vand.vx v10, v10, a1
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a1, 209715
-; RV64-NEXT: addiw a1, a1, 819
-; RV64-NEXT: slli a2, a1, 32
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vand.vx v10, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
; RV64-NEXT: lui a1, 61681
; RV64-NEXT: addiw a1, a1, -241
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
+; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vsll.vi v12, v10, 2
+; RV64-NEXT: vxor.vx v12, v12, a1
+; RV64-NEXT: vadd.vv v14, v12, v12
+; RV64-NEXT: vxor.vv v14, v12, v14
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vand.vv v14, v16, v14
+; RV64-NEXT: vsub.vv v8, v8, v14
+; RV64-NEXT: vand.vv v14, v8, v12
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vv v8, v8, v12
+; RV64-NEXT: vadd.vv v8, v14, v8
+; RV64-NEXT: vsrl.vi v12, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v12
; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: lui a1, 4112
-; RV64-NEXT: addiw a1, a1, 257
-; RV64-NEXT: slli a2, a1, 32
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vmul.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v10, v10, 3
+; RV64-NEXT: vand.vx v10, v10, a1
+; RV64-NEXT: vmul.vv v8, v8, v10
; RV64-NEXT: li a1, 56
; RV64-NEXT: vsrl.vx v8, v8, a1
; RV64-NEXT: vse64.v v8, (a0)
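
For reference, the updated checks above derive every popcount bit mask from the
single splatted/materialized constant 0x0f0f0f0f0f0f0f0f instead of building
each mask separately (the removed lui 349525 / lui 209715 / lui 4112 sequences).
A minimal C++ sketch of that derivation; the names C4/C2/C1/C0 are ours, purely
illustrative, and are not identifiers from the patch:

    #include <cstdint>

    // Only the 0x0f... mask is materialized; the remaining masks are derived
    // with one shift plus one xor/and each, mirroring the vector code above.
    constexpr uint64_t C4 = 0x0f0f0f0f0f0f0f0fULL; // splatted base mask
    constexpr uint64_t C2 = C4 ^ (C4 << 2);        // vsll.vi 2 + vxor          -> 0x3333...
    constexpr uint64_t C1 = C2 ^ (C2 + C2);        // (vsll.vi 1 or vadd.vv) + vxor -> 0x5555...
    constexpr uint64_t C0 = C4 & (C4 >> 3);        // vsrl.vi 3 + vand          -> 0x0101...

    static_assert(C2 == 0x3333333333333333ULL, "second-level mask");
    static_assert(C1 == 0x5555555555555555ULL, "first-level mask");
    static_assert(C0 == 0x0101010101010101ULL, "byte-sum multiplier");

    int main() { return 0; }

In the RV64 checks this trades the repeated lui/addiw/slli/add scalar sequences
for one vector shift plus one vector xor/and per derived mask; the RV32 checks
derive the same masks from a single e32 splat of 0x0f0f0f0f.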
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index 49f6ffd691292a..4a17552fc137ca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -777,41 +777,31 @@ declare <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64>, i1 immarg, <2 x i1>, i32)
define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v2i64:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v9, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v9, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsll.vi v10, v9, 2, v0.t
+; RV32-NEXT: vxor.vv v10, v9, v10, v0.t
+; RV32-NEXT: vsll.vi v11, v10, 1, v0.t
+; RV32-NEXT: vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v12, v8, a0, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT: vand.vv v11, v12, v11, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v11, v0.t
+; RV32-NEXT: vand.vv v11, v8, v10, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v10, v0.t
+; RV32-NEXT: vadd.vv v8, v11, v8, v0.t
+; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vi v10, v9, 3, v0.t
+; RV32-NEXT: vand.vv v9, v9, v10, v0.t
; RV32-NEXT: vmul.vv v8, v8, v9, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -819,38 +809,34 @@ define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
;
; RV64-LABEL: vp_cttz_v2i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a1
; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT: vsub.vx v9, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v10, v9, 2, v0.t
+; RV64-NEXT: vxor.vx v10, v10, a1, v0.t
+; RV64-NEXT: vsll.vi v11, v10, 1, v0.t
+; RV64-NEXT: vxor.vv v11, v10, v11, v0.t
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v12, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vand.vv v8, v8, v9, v0.t
-; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v12, v0.t
+; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT: vand.vv v11, v12, v11, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v11, v0.t
+; RV64-NEXT: vand.vv v11, v8, v10, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v10, v0.t
+; RV64-NEXT: vadd.vv v8, v11, v8, v0.t
+; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v10, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v9, v9, 3, v0.t
+; RV64-NEXT: vand.vx v9, v9, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v9, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -861,41 +847,31 @@ define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v2i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vsub.vx v9, v8, a1
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v9, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsll.vi v10, v9, 2
+; RV32-NEXT: vxor.vv v10, v9, v10
+; RV32-NEXT: vadd.vv v11, v10, v10
+; RV32-NEXT: vxor.vv v11, v10, v11
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v12, v8, a0
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: vsrl.vi v12, v8, 1
+; RV32-NEXT: vand.vv v11, v12, v11
+; RV32-NEXT: vsub.vv v8, v8, v11
+; RV32-NEXT: vand.vv v11, v8, v10
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: vadd.vv v8, v11, v8
+; RV32-NEXT: vsrl.vi v10, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vi v10, v9, 3
+; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -903,38 +879,34 @@ define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
;
; RV64-LABEL: vp_cttz_v2i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a1
; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT: vsub.vx v9, v8, a1
+; RV64-NEXT: vsll.vi v10, v9, 2
+; RV64-NEXT: vxor.vx v10, v10, a1
+; RV64-NEXT: vadd.vv v11, v10, v10
+; RV64-NEXT: vxor.vv v11, v10, v11
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v12, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v12
+; RV64-NEXT: vsrl.vi v12, v8, 1
+; RV64-NEXT: vand.vv v11, v12, v11
+; RV64-NEXT: vsub.vv v8, v8, v11
+; RV64-NEXT: vand.vv v11, v8, v10
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v10
+; RV64-NEXT: vadd.vv v8, v11, v8
+; RV64-NEXT: vsrl.vi v10, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v10
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v9, v9, 3
+; RV64-NEXT: vand.vx v9, v9, a1
+; RV64-NEXT: vmul.vv v8, v8, v9
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -949,41 +921,31 @@ declare <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64>, i1 immarg, <4 x i1>, i32)
define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v4i64:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsll.vi v12, v10, 2, v0.t
+; RV32-NEXT: vxor.vv v12, v10, v12, v0.t
+; RV32-NEXT: vsll.vi v14, v12, 1, v0.t
+; RV32-NEXT: vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v16, v8, a0, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: vand.vv v14, v16, v14, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v14, v0.t
+; RV32-NEXT: vand.vv v14, v8, v12, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: vadd.vv v8, v14, v8, v0.t
+; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v12, v10, 3, v0.t
+; RV32-NEXT: vand.vv v10, v10, v12, v0.t
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -991,38 +953,34 @@ define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
;
; RV64-LABEL: vp_cttz_v4i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT: vsub.vx v10, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v12, v10, 2, v0.t
+; RV64-NEXT: vxor.vx v12, v12, a1, v0.t
+; RV64-NEXT: vsll.vi v14, v12, 1, v0.t
+; RV64-NEXT: vxor.vv v14, v12, v14, v0.t
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v16, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vand.vv v8, v8, v10, v0.t
-; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT: vand.vv v14, v16, v14, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v14, v0.t
+; RV64-NEXT: vand.vv v14, v8, v12, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v12, v0.t
+; RV64-NEXT: vadd.vv v8, v14, v8, v0.t
+; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v12, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v10, v10, 3, v0.t
+; RV64-NEXT: vand.vx v10, v10, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v10, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -1033,41 +991,31 @@ define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v4i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsub.vx v10, v8, a1
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsll.vi v12, v10, 2
+; RV32-NEXT: vxor.vv v12, v10, v12
+; RV32-NEXT: vadd.vv v14, v12, v12
+; RV32-NEXT: vxor.vv v14, v12, v14
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v16, v8, a0
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v14, v16, v14
+; RV32-NEXT: vsub.vv v8, v8, v14
+; RV32-NEXT: vand.vv v14, v8, v12
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: vadd.vv v8, v14, v8
+; RV32-NEXT: vsrl.vi v12, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v12
; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v12, v10, 3
+; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -1075,38 +1023,34 @@ define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
;
; RV64-LABEL: vp_cttz_v4i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT: vsub.vx v10, v8, a1
+; RV64-NEXT: vsll.vi v12, v10, 2
+; RV64-NEXT: vxor.vx v12, v12, a1
+; RV64-NEXT: vadd.vv v14, v12, v12
+; RV64-NEXT: vxor.vv v14, v12, v14
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v16, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vand.vv v14, v16, v14
+; RV64-NEXT: vsub.vv v8, v8, v14
+; RV64-NEXT: vand.vv v14, v8, v12
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v12
+; RV64-NEXT: vadd.vv v8, v14, v8
+; RV64-NEXT: vsrl.vi v12, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v12
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v10, v10, 3
+; RV64-NEXT: vand.vx v10, v10, a1
+; RV64-NEXT: vmul.vv v8, v8, v10
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -1121,41 +1065,31 @@ declare <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64>, i1 immarg, <8 x i1>, i32)
define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsll.vi v16, v12, 2, v0.t
+; RV32-NEXT: vxor.vv v16, v12, v16, v0.t
+; RV32-NEXT: vsll.vi v20, v16, 1, v0.t
+; RV32-NEXT: vxor.vv v20, v16, v20, v0.t
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v24, v8, a0, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: vand.vv v20, v24, v20, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v20, v0.t
+; RV32-NEXT: vand.vv v20, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vadd.vv v8, v20, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vi v16, v12, 3, v0.t
+; RV32-NEXT: vand.vv v12, v12, v16, v0.t
; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -1163,38 +1097,34 @@ define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
;
; RV64-LABEL: vp_cttz_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT: vsub.vx v12, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v12, 2, v0.t
+; RV64-NEXT: vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT: vsll.vi v20, v16, 1, v0.t
+; RV64-NEXT: vxor.vv v20, v16, v20, v0.t
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v24, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vand.vv v8, v8, v12, v0.t
-; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v24, v0.t
+; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT: vand.vv v20, v24, v20, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v20, v0.t
+; RV64-NEXT: vand.vv v20, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: vadd.vv v8, v20, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v12, v12, 3, v0.t
+; RV64-NEXT: vand.vx v12, v12, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v12, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -1205,41 +1135,31 @@ define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v8i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsub.vx v12, v8, a1
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsll.vi v16, v12, 2
+; RV32-NEXT: vxor.vv v16, v12, v16
+; RV32-NEXT: vadd.vv v20, v16, v16
+; RV32-NEXT: vxor.vv v20, v16, v20
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v24, v8, a0
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vand.vv v20, v24, v20
+; RV32-NEXT: vsub.vv v8, v8, v20
+; RV32-NEXT: vand.vv v20, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vadd.vv v8, v20, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vi v16, v12, 3
+; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -1247,38 +1167,34 @@ define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
;
; RV64-LABEL: vp_cttz_v8i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT: vsub.vx v12, v8, a1
+; RV64-NEXT: vsll.vi v16, v12, 2
+; RV64-NEXT: vxor.vx v16, v16, a1
+; RV64-NEXT: vadd.vv v20, v16, v16
+; RV64-NEXT: vxor.vv v20, v16, v20
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v24, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: vsrl.vi v24, v8, 1
+; RV64-NEXT: vand.vv v20, v24, v20
+; RV64-NEXT: vsub.vv v8, v8, v20
+; RV64-NEXT: vand.vv v20, v8, v16
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vadd.vv v8, v20, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v12, v12, 3
+; RV64-NEXT: vand.vx v12, v12, a1
+; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -1293,196 +1209,290 @@ declare <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT: vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT: vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT: vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v16, v8, a0, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v15i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v24, v16, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v24, 1, v0.t
+; RV64-NEXT: vxor.vv v8, v24, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vx v16, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vand.vv v8, v8, v16, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT: ret
- %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
- ret <15 x i64> %v
-}
-
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT: vand.vv v16, v8, v24, v0.t
+; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV64-NEXT: vand.vv v8, v8, v24, v0.t
+; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT: li a0, 56
+; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
+ ret <15 x i64> %v
+}
+
define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v15i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1
+; RV32-NEXT: vsll.vi v24, v16, 2
+; RV32-NEXT: vxor.vv v24, v16, v24
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v0, v8, a0
; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vadd.vv v0, v24, v24
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v8
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v0
+; RV32-NEXT: vand.vv v0, v8, v24
; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v15i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1
+; RV64-NEXT: vsll.vi v24, v16, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v0, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v0, v24, v0
; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vand.vv v16, v16, v0
; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
+; RV64-NEXT: vand.vv v16, v8, v24
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v24
; RV64-NEXT: vadd.vv v8, v16, v8
; RV64-NEXT: vsrl.vi v16, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <15 x i1> poison, i1 true, i32 0
%m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer
@@ -1495,97 +1505,182 @@ declare <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32)
define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT: vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT: vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT: vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v16, v8, a0, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v16i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v24, v16, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v24, 1, v0.t
+; RV64-NEXT: vxor.vv v8, v24, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vx v16, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vand.vv v8, v8, v16, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
+; RV64-NEXT: vand.vv v16, v8, v24, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v24, v0.t
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl)
ret <16 x i64> %v
@@ -1594,97 +1689,106 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1
+; RV32-NEXT: vsll.vi v24, v16, 2
+; RV32-NEXT: vxor.vv v24, v16, v24
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v0, v8, a0
; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vadd.vv v0, v24, v24
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v8
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v0
+; RV32-NEXT: vand.vv v0, v8, v24
; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v16i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1
+; RV64-NEXT: vsll.vi v24, v16, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v0, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v0, v24, v0
; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vand.vv v16, v16, v0
; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
+; RV64-NEXT: vand.vv v16, v8, v24
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v24
; RV64-NEXT: vadd.vv v8, v16, v8
; RV64-NEXT: vsrl.vi v16, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <16 x i1> poison, i1 true, i32 0
%m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
@@ -1697,155 +1801,139 @@ declare <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64>, i1 immarg, <32 x i1>, i32)
define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v32i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 56
+; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: li a2, 16
; RV32-NEXT: vslidedown.vi v24, v0, 2
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: mv a2, a0
-; RV32-NEXT: bltu a0, a3, .LBB34_2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: bltu a0, a2, .LBB34_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a2, 16
+; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB34_2:
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vnot.v v8, v16, v0.t
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
@@ -1854,88 +1942,116 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a0, a0, a3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT: vnot.v v16, v16, v0.t
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a0, sp, 48
-; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vnot.v v8, v16, v0.t
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 56
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v32i64:
@@ -1943,14 +2059,21 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: li a2, 48
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: li a1, 16
; RV64-NEXT: vslidedown.vi v24, v0, 2
@@ -1959,72 +2082,231 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a2, 16
; RV64-NEXT: .LBB34_2:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a3, a1, 32
+; RV64-NEXT: add a1, a1, a3
+; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 40
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV64-NEXT: vnot.v v8, v8, v0.t
+; RV64-NEXT: vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 5
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: li a2, 1
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vx v8, v16, a2, v0.t
+; RV64-NEXT: addi a3, sp, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vnot.v v8, v16, v0.t
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV64-NEXT: vand.vv v8, v8, v16, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a2, 349525
-; RV64-NEXT: addiw a2, a2, 1365
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: vand.vx v16, v16, a2, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 3
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a3, 209715
-; RV64-NEXT: addiw a3, a3, 819
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v16, v8, a3, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a4, 61681
-; RV64-NEXT: addiw a4, a4, -241
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vand.vx v8, v8, a4, v0.t
-; RV64-NEXT: lui a5, 4112
-; RV64-NEXT: addiw a5, a5, 257
-; RV64-NEXT: slli a6, a5, 32
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT: li a6, 56
-; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT: addi a7, sp, 16
-; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT: addi a7, a0, -16
-; RV64-NEXT: sltu a0, a0, a7
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 40
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV64-NEXT: vand.vx v8, v16, a1, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vmul.vv v16, v16, v8, v0.t
+; RV64-NEXT: li a3, 56
+; RV64-NEXT: vsrl.vx v16, v16, a3, v0.t
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: addi a4, a0, -16
+; RV64-NEXT: sltu a0, a0, a4
; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a7
+; RV64-NEXT: and a0, a0, a4
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vmv1r.v v0, v24
; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a4, 40
+; RV64-NEXT: mul a0, a0, a4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsll.vi v8, v16, 1, v0.t
+; RV64-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a4, 24
+; RV64-NEXT: mul a0, a0, a4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: vsub.vx v16, v8, a2, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vnot.v v16, v8, v0.t
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: vand.vx v16, v16, a2, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: vand.vx v16, v8, a3, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v16, 2, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: vand.vx v8, v8, a4, v0.t
-; RV64-NEXT: vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT: vsrl.vx v16, v8, a6, v0.t
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 40
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT: vsrl.vx v16, v8, a3, v0.t
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 48
+; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
@@ -2035,193 +2317,233 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
-; RV32-NEXT: vmv8r.v v24, v16
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT: li a2, 16
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: mv a1, a0
; RV32-NEXT: bltu a0, a2, .LBB35_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB35_2:
-; RV32-NEXT: li a2, 1
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a2
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsub.vx v0, v8, a1
; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v16, v8, v0
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v0, v24, 2
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: vadd.vv v8, v0, v0
+; RV32-NEXT: vxor.vv v8, v0, v8
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v16, 1
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v0
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: addi a3, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a3), zero
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v16, v8
; RV32-NEXT: vand.vv v16, v8, v0
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v0
; RV32-NEXT: vadd.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: addi a3, sp, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: vmul.vv v8, v8, v16
-; RV32-NEXT: li a1, 56
-; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: li a2, 56
+; RV32-NEXT: vsrl.vx v8, v8, a2
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v24, a2
-; RV32-NEXT: vnot.v v24, v24
-; RV32-NEXT: vand.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v24, v8, 1
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16
-; RV32-NEXT: vsub.vv v8, v8, v24
-; RV32-NEXT: vand.vv v24, v8, v0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v0
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v16, v8, a1
+; RV32-NEXT: vnot.v v0, v8
+; RV32-NEXT: vand.vv v8, v0, v16
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v0, v24, 2
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: vadd.vv v16, v0, v0
+; RV32-NEXT: vxor.vv v16, v0, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: addi a0, sp, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: vand.vv v16, v8, v0
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: vmul.vv v8, v8, v16
-; RV32-NEXT: vsrl.vx v16, v8, a1
+; RV32-NEXT: vsrl.vx v16, v8, a2
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v32i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a2, 16
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: bltu a0, a2, .LBB35_2
-; RV64-NEXT: # %bb.1:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV64-NEXT: li a1, 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: mv a2, a0
+; RV64-NEXT: bltu a0, a1, .LBB35_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a2, 16
; RV64-NEXT: .LBB35_2:
-; RV64-NEXT: li a2, 1
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v24, v8, a2
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: vsrl.vi v24, v8, 1
-; RV64-NEXT: lui a1, 349525
-; RV64-NEXT: addiw a1, a1, 1365
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
; RV64-NEXT: slli a3, a1, 32
; RV64-NEXT: add a1, a1, a3
-; RV64-NEXT: vand.vx v24, v24, a1
-; RV64-NEXT: vsub.vv v8, v8, v24
-; RV64-NEXT: lui a3, 209715
-; RV64-NEXT: addiw a3, a3, 819
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v24, v8, a3
+; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT: li a2, 1
+; RV64-NEXT: vsub.vx v0, v8, a2
+; RV64-NEXT: vnot.v v8, v8
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vsll.vi v0, v16, 2
+; RV64-NEXT: vxor.vx v0, v0, a1
+; RV64-NEXT: vadd.vv v24, v0, v0
+; RV64-NEXT: vxor.vv v24, v0, v24
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vand.vv v16, v16, v24
+; RV64-NEXT: vsub.vv v8, v8, v16
+; RV64-NEXT: vand.vv v16, v8, v0
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: vadd.vv v8, v24, v8
-; RV64-NEXT: vsrl.vi v24, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v24
-; RV64-NEXT: lui a4, 61681
-; RV64-NEXT: addiw a4, a4, -241
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vand.vx v8, v8, a4
-; RV64-NEXT: lui a5, 4112
-; RV64-NEXT: addiw a5, a5, 257
-; RV64-NEXT: slli a6, a5, 32
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: vmul.vx v8, v8, a5
-; RV64-NEXT: li a6, 56
-; RV64-NEXT: vsrl.vx v8, v8, a6
-; RV64-NEXT: addi a7, a0, -16
-; RV64-NEXT: sltu a0, a0, a7
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v0, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v0, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
+; RV64-NEXT: li a3, 56
+; RV64-NEXT: vsrl.vx v8, v8, a3
+; RV64-NEXT: addi a4, sp, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: addi a4, a0, -16
+; RV64-NEXT: sltu a0, a0, a4
; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a7
+; RV64-NEXT: and a0, a0, a4
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v24, v16, a2
-; RV64-NEXT: vnot.v v16, v16
-; RV64-NEXT: vand.vv v16, v16, v24
-; RV64-NEXT: vsrl.vi v24, v16, 1
-; RV64-NEXT: vand.vx v24, v24, a1
-; RV64-NEXT: vsub.vv v16, v16, v24
-; RV64-NEXT: vand.vx v24, v16, a3
-; RV64-NEXT: vsrl.vi v16, v16, 2
-; RV64-NEXT: vand.vx v16, v16, a3
-; RV64-NEXT: vadd.vv v16, v24, v16
-; RV64-NEXT: vsrl.vi v24, v16, 4
-; RV64-NEXT: vadd.vv v16, v16, v24
-; RV64-NEXT: vand.vx v16, v16, a4
-; RV64-NEXT: vmul.vx v16, v16, a5
-; RV64-NEXT: vsrl.vx v16, v16, a6
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vx v16, v8, a2
+; RV64-NEXT: vnot.v v24, v8
+; RV64-NEXT: vand.vv v16, v24, v16
+; RV64-NEXT: vsll.vi v24, v0, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v8, v24, v0
+; RV64-NEXT: vsrl.vi v0, v16, 1
+; RV64-NEXT: vand.vv v8, v0, v8
+; RV64-NEXT: vsub.vv v8, v16, v8
+; RV64-NEXT: vand.vv v16, v8, v24
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
+; RV64-NEXT: vsrl.vx v16, v8, a3
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <32 x i1> poison, i1 true, i32 0
%m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
@@ -2976,41 +3298,31 @@ define <16 x i32> @vp_cttz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex
define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v2i64:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v9, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v9, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsll.vi v10, v9, 2, v0.t
+; RV32-NEXT: vxor.vv v10, v9, v10, v0.t
+; RV32-NEXT: vsll.vi v11, v10, 1, v0.t
+; RV32-NEXT: vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v12, v8, a0, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT: vand.vv v11, v12, v11, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v11, v0.t
+; RV32-NEXT: vand.vv v11, v8, v10, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v10, v0.t
+; RV32-NEXT: vadd.vv v8, v11, v8, v0.t
+; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vi v10, v9, 3, v0.t
+; RV32-NEXT: vand.vv v9, v9, v10, v0.t
; RV32-NEXT: vmul.vv v8, v8, v9, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -3018,38 +3330,34 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
;
; RV64-LABEL: vp_cttz_zero_undef_v2i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a1
; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT: vsub.vx v9, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v10, v9, 2, v0.t
+; RV64-NEXT: vxor.vx v10, v10, a1, v0.t
+; RV64-NEXT: vsll.vi v11, v10, 1, v0.t
+; RV64-NEXT: vxor.vv v11, v10, v11, v0.t
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v12, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vand.vv v8, v8, v9, v0.t
-; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v12, v0.t
+; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT: vand.vv v11, v12, v11, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v11, v0.t
+; RV64-NEXT: vand.vv v11, v8, v10, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v10, v0.t
+; RV64-NEXT: vadd.vv v8, v11, v8, v0.t
+; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v10, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v9, v9, 3, v0.t
+; RV64-NEXT: vand.vx v9, v9, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v9, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -3060,41 +3368,31 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v2i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vsub.vx v9, v8, a1
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v9, v9, v10
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v10, v8, v9
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v9, a1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsll.vi v10, v9, 2
+; RV32-NEXT: vxor.vv v10, v9, v10
+; RV32-NEXT: vadd.vv v11, v10, v10
+; RV32-NEXT: vxor.vv v11, v10, v11
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v12, v8, a0
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: vsrl.vi v12, v8, 1
+; RV32-NEXT: vand.vv v11, v12, v11
+; RV32-NEXT: vsub.vv v8, v8, v11
+; RV32-NEXT: vand.vv v11, v8, v10
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: vadd.vv v8, v11, v8
+; RV32-NEXT: vsrl.vi v10, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vi v10, v9, 3
+; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -3102,38 +3400,34 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
;
; RV64-LABEL: vp_cttz_zero_undef_v2i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a1
; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT: vsub.vx v9, v8, a1
+; RV64-NEXT: vsll.vi v10, v9, 2
+; RV64-NEXT: vxor.vx v10, v10, a1
+; RV64-NEXT: vadd.vv v11, v10, v10
+; RV64-NEXT: vxor.vv v11, v10, v11
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v12, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v9, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v12
+; RV64-NEXT: vsrl.vi v12, v8, 1
+; RV64-NEXT: vand.vv v11, v12, v11
+; RV64-NEXT: vsub.vv v8, v8, v11
+; RV64-NEXT: vand.vv v11, v8, v10
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v10
+; RV64-NEXT: vadd.vv v8, v11, v8
+; RV64-NEXT: vsrl.vi v10, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v10
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v9, v9, 3
+; RV64-NEXT: vand.vx v9, v9, a1
+; RV64-NEXT: vmul.vv v8, v8, v9
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -3146,41 +3440,31 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v4i64:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsll.vi v12, v10, 2, v0.t
+; RV32-NEXT: vxor.vv v12, v10, v12, v0.t
+; RV32-NEXT: vsll.vi v14, v12, 1, v0.t
+; RV32-NEXT: vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v16, v8, a0, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: vand.vv v14, v16, v14, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v14, v0.t
+; RV32-NEXT: vand.vv v14, v8, v12, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v12, v0.t
+; RV32-NEXT: vadd.vv v8, v14, v8, v0.t
+; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v12, v10, 3, v0.t
+; RV32-NEXT: vand.vv v10, v10, v12, v0.t
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -3188,38 +3472,34 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
;
; RV64-LABEL: vp_cttz_zero_undef_v4i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT: vsub.vx v10, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v12, v10, 2, v0.t
+; RV64-NEXT: vxor.vx v12, v12, a1, v0.t
+; RV64-NEXT: vsll.vi v14, v12, 1, v0.t
+; RV64-NEXT: vxor.vv v14, v12, v14, v0.t
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v16, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vand.vv v8, v8, v10, v0.t
-; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT: vand.vv v14, v16, v14, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v14, v0.t
+; RV64-NEXT: vand.vv v14, v8, v12, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v12, v0.t
+; RV64-NEXT: vadd.vv v8, v14, v8, v0.t
+; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v12, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v10, v10, 3, v0.t
+; RV64-NEXT: vand.vx v10, v10, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v10, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -3230,41 +3510,31 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v4i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsub.vx v10, v8, a1
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v10, v10, v12
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vand.vv v12, v8, v10
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32-NEXT: vmv.v.x v10, a1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsll.vi v12, v10, 2
+; RV32-NEXT: vxor.vv v12, v10, v12
+; RV32-NEXT: vadd.vv v14, v12, v12
+; RV32-NEXT: vxor.vv v14, v12, v14
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v16, v8, a0
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v14, v16, v14
+; RV32-NEXT: vsub.vv v8, v8, v14
+; RV32-NEXT: vand.vv v14, v8, v12
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: vadd.vv v8, v14, v8
+; RV32-NEXT: vsrl.vi v12, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v12
; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v12, v10, 3
+; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -3272,38 +3542,34 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
;
; RV64-LABEL: vp_cttz_zero_undef_v4i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a1
; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT: vsub.vx v10, v8, a1
+; RV64-NEXT: vsll.vi v12, v10, 2
+; RV64-NEXT: vxor.vx v12, v12, a1
+; RV64-NEXT: vadd.vv v14, v12, v12
+; RV64-NEXT: vxor.vv v14, v12, v14
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v16, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v10, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vand.vv v14, v16, v14
+; RV64-NEXT: vsub.vv v8, v8, v14
+; RV64-NEXT: vand.vv v14, v8, v12
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v12
+; RV64-NEXT: vadd.vv v8, v14, v8
+; RV64-NEXT: vsrl.vi v12, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v12
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v10, v10, 3
+; RV64-NEXT: vand.vx v10, v10, a1
+; RV64-NEXT: vmul.vv v8, v8, v10
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -3316,41 +3582,31 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsll.vi v16, v12, 2, v0.t
+; RV32-NEXT: vxor.vv v16, v12, v16, v0.t
+; RV32-NEXT: vsll.vi v20, v16, 1, v0.t
+; RV32-NEXT: vxor.vv v20, v16, v20, v0.t
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v24, v8, a0, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: vand.vv v20, v24, v20, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v20, v0.t
+; RV32-NEXT: vand.vv v20, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vadd.vv v8, v20, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vi v16, v12, 3, v0.t
+; RV32-NEXT: vand.vv v12, v12, v16, v0.t
; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -3358,38 +3614,34 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
;
; RV64-LABEL: vp_cttz_zero_undef_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT: vsub.vx v12, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v12, 2, v0.t
+; RV64-NEXT: vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT: vsll.vi v20, v16, 1, v0.t
+; RV64-NEXT: vxor.vv v20, v16, v20, v0.t
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v24, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vand.vv v8, v8, v12, v0.t
-; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v24, v0.t
+; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT: vand.vv v20, v24, v20, v0.t
+; RV64-NEXT: vsub.vv v8, v8, v20, v0.t
+; RV64-NEXT: vand.vv v20, v8, v16, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT: vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: vadd.vv v8, v20, v8, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: vsrl.vi v12, v12, 3, v0.t
+; RV64-NEXT: vand.vx v12, v12, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v12, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: ret
@@ -3400,41 +3652,31 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v8i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsub.vx v12, v8, a1
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v12, v12, v16
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v12
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma
; RV32-NEXT: vmv.v.x v12, a1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsll.vi v16, v12, 2
+; RV32-NEXT: vxor.vv v16, v12, v16
+; RV32-NEXT: vadd.vv v20, v16, v16
+; RV32-NEXT: vxor.vv v20, v16, v20
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v24, v8, a0
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vand.vv v20, v24, v20
+; RV32-NEXT: vsub.vv v8, v8, v20
+; RV32-NEXT: vand.vv v20, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vadd.vv v8, v20, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vi v16, v12, 3
+; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
@@ -3442,38 +3684,34 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
;
; RV64-LABEL: vp_cttz_zero_undef_v8i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v12, a1
; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT: vsub.vx v12, v8, a1
+; RV64-NEXT: vsll.vi v16, v12, 2
+; RV64-NEXT: vxor.vx v16, v16, a1
+; RV64-NEXT: vadd.vv v20, v16, v16
+; RV64-NEXT: vxor.vv v20, v16, v20
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v24, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v12, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: vsrl.vi v24, v8, 1
+; RV64-NEXT: vand.vv v20, v24, v20
+; RV64-NEXT: vsub.vv v8, v8, v20
+; RV64-NEXT: vand.vv v20, v8, v16
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vadd.vv v8, v20, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: vsrl.vi v12, v12, 3
+; RV64-NEXT: vand.vx v12, v12, a1
+; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
@@ -3486,97 +3724,182 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT: vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT: vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT: vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v16, v8, a0, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_v15i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v24, v16, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v24, 1, v0.t
+; RV64-NEXT: vxor.vv v8, v24, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vx v16, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vand.vv v8, v8, v16, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
+; RV64-NEXT: vand.vv v16, v8, v24, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v24, v0.t
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl)
ret <15 x i64> %v
@@ -3585,97 +3908,106 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v15i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1
+; RV32-NEXT: vsll.vi v24, v16, 2
+; RV32-NEXT: vxor.vv v24, v16, v24
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v0, v8, a0
; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vadd.vv v0, v24, v24
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v8
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v0
+; RV32-NEXT: vand.vv v0, v8, v24
; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_v15i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1
+; RV64-NEXT: vsll.vi v24, v16, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v0, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v0, v24, v0
; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vand.vv v16, v16, v0
; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
+; RV64-NEXT: vand.vv v16, v8, v24
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v24
; RV64-NEXT: vadd.vv v8, v16, v8
; RV64-NEXT: vsrl.vi v16, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <15 x i1> poison, i1 true, i32 0
%m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer
@@ -3686,97 +4018,182 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT: vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT: vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT: vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v16, v8, a0, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_v16i64:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v24, v16, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v24, 1, v0.t
+; RV64-NEXT: vxor.vv v8, v24, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vx v16, v8, a0, v0.t
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vand.vv v8, v8, v16, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0, v0.t
+; RV64-NEXT: vand.vv v16, v8, v24, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vv v8, v8, v24, v0.t
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0, v0.t
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, i32 %evl)
ret <16 x i64> %v
@@ -3785,97 +4202,106 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1
+; RV32-NEXT: vsll.vi v24, v16, 2
+; RV32-NEXT: vxor.vv v24, v16, v24
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vx v0, v8, a0
; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vadd.vv v0, v24, v24
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v8
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v0
+; RV32-NEXT: vand.vv v0, v8, v24
; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v16, 3
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_v16i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1
+; RV64-NEXT: vsll.vi v24, v16, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsub.vx v0, v8, a0
; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v0, v24, v0
; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vand.vv v16, v16, v0
; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v16, v8, a0
+; RV64-NEXT: vand.vv v16, v8, v24
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vand.vv v8, v8, v24
; RV64-NEXT: vadd.vv v8, v16, v8
; RV64-NEXT: vsrl.vi v16, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a1, a0, 32
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: li a0, 56
; RV64-NEXT: vsrl.vx v8, v8, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <16 x i1> poison, i1 true, i32 0
%m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
@@ -3886,155 +4312,139 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v32i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 56
+; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: li a2, 16
; RV32-NEXT: vslidedown.vi v24, v0, 2
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: mv a2, a0
-; RV32-NEXT: bltu a0, a3, .LBB70_2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: bltu a0, a2, .LBB70_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a2, 16
+; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB70_2:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vnot.v v8, v16, v0.t
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
@@ -4043,88 +4453,116 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a0, a0, a3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT: vnot.v v16, v16, v0.t
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a0, sp, 48
-; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vnot.v v8, v16, v0.t
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 56
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_v32i64:
@@ -4132,14 +4570,21 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: li a2, 48
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: li a1, 16
; RV64-NEXT: vslidedown.vi v24, v0, 2
@@ -4148,72 +4593,231 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a2, 16
; RV64-NEXT: .LBB70_2:
-; RV64-NEXT: li a1, 1
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
+; RV64-NEXT: slli a3, a1, 32
+; RV64-NEXT: add a1, a1, a3
+; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 40
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV64-NEXT: vnot.v v8, v8, v0.t
+; RV64-NEXT: vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 5
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT: vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: li a2, 1
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vx v8, v16, a2, v0.t
+; RV64-NEXT: addi a3, sp, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vnot.v v8, v16, v0.t
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV64-NEXT: vand.vv v8, v8, v16, v0.t
-; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: lui a2, 349525
-; RV64-NEXT: addiw a2, a2, 1365
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: vand.vx v16, v16, a2, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 3
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a3, 209715
-; RV64-NEXT: addiw a3, a3, 819
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v16, v8, a3, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: lui a4, 61681
-; RV64-NEXT: addiw a4, a4, -241
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vand.vx v8, v8, a4, v0.t
-; RV64-NEXT: lui a5, 4112
-; RV64-NEXT: addiw a5, a5, 257
-; RV64-NEXT: slli a6, a5, 32
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT: li a6, 56
-; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT: addi a7, sp, 16
-; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT: addi a7, a0, -16
-; RV64-NEXT: sltu a0, a0, a7
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 40
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v8, 3, v0.t
+; RV64-NEXT: vand.vx v8, v16, a1, v0.t
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vmul.vv v16, v16, v8, v0.t
+; RV64-NEXT: li a3, 56
+; RV64-NEXT: vsrl.vx v16, v16, a3, v0.t
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: addi a4, a0, -16
+; RV64-NEXT: sltu a0, a0, a4
; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a7
+; RV64-NEXT: and a0, a0, a4
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vmv1r.v v0, v24
; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a4, 40
+; RV64-NEXT: mul a0, a0, a4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT: vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsll.vi v8, v16, 1, v0.t
+; RV64-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a4, 24
+; RV64-NEXT: mul a0, a0, a4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV64-NEXT: vnot.v v8, v8, v0.t
-; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: vsub.vx v16, v8, a2, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vnot.v v16, v8, v0.t
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT: vand.vx v16, v16, a2, v0.t
-; RV64-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT: vand.vx v16, v8, a3, v0.t
-; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v16, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vv v16, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vi v8, v16, 2, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vv v8, v8, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vadd.vv v8, v16, v8, v0.t
; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT: vand.vx v8, v8, a4, v0.t
-; RV64-NEXT: vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT: vsrl.vx v16, v8, a6, v0.t
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vand.vx v8, v8, a1, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a2, 40
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT: vsrl.vx v16, v8, a3, v0.t
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 48
+; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
@@ -4224,193 +4828,233 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
-; RV32-NEXT: vmv8r.v v24, v16
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT: li a2, 16
-; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: mv a1, a0
; RV32-NEXT: bltu a0, a2, .LBB71_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB71_2:
-; RV32-NEXT: li a2, 1
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a2
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsub.vx v0, v8, a1
; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vand.vv v16, v8, v0
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v0, v24, 2
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: vadd.vv v8, v0, v0
+; RV32-NEXT: vxor.vv v8, v0, v8
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v16, 1
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v0
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: addi a3, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a3), zero
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v16, v8
; RV32-NEXT: vand.vv v16, v8, v0
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v0
; RV32-NEXT: vadd.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: addi a3, sp, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: vmul.vv v8, v8, v16
-; RV32-NEXT: li a1, 56
-; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: li a2, 56
+; RV32-NEXT: vsrl.vx v8, v8, a2
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v24, a2
-; RV32-NEXT: vnot.v v24, v24
-; RV32-NEXT: vand.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v24, v8, 1
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16
-; RV32-NEXT: vsub.vv v8, v8, v24
-; RV32-NEXT: vand.vv v24, v8, v0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v0
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vx v16, v8, a1
+; RV32-NEXT: vnot.v v0, v8
+; RV32-NEXT: vand.vv v8, v0, v16
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vi v0, v24, 2
+; RV32-NEXT: vxor.vv v0, v24, v0
+; RV32-NEXT: vadd.vv v16, v0, v0
+; RV32-NEXT: vxor.vv v16, v0, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: addi a0, sp, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: vand.vv v16, v8, v0
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsrl.vi v16, v24, 3
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: vmul.vv v8, v8, v16
-; RV32-NEXT: vsrl.vx v16, v8, a1
+; RV32-NEXT: vsrl.vx v16, v8, a2
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_v32i64_unmasked:
; RV64: # %bb.0:
-; RV64-NEXT: li a2, 16
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: bltu a0, a2, .LBB71_2
-; RV64-NEXT: # %bb.1:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV64-NEXT: li a1, 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: mv a2, a0
+; RV64-NEXT: bltu a0, a1, .LBB71_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a2, 16
; RV64-NEXT: .LBB71_2:
-; RV64-NEXT: li a2, 1
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v24, v8, a2
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: vsrl.vi v24, v8, 1
-; RV64-NEXT: lui a1, 349525
-; RV64-NEXT: addiw a1, a1, 1365
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
; RV64-NEXT: slli a3, a1, 32
; RV64-NEXT: add a1, a1, a3
-; RV64-NEXT: vand.vx v24, v24, a1
-; RV64-NEXT: vsub.vv v8, v8, v24
-; RV64-NEXT: lui a3, 209715
-; RV64-NEXT: addiw a3, a3, 819
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: vand.vx v24, v8, a3
+; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v16, a1
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT: li a2, 1
+; RV64-NEXT: vsub.vx v0, v8, a2
+; RV64-NEXT: vnot.v v8, v8
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vsll.vi v0, v16, 2
+; RV64-NEXT: vxor.vx v0, v0, a1
+; RV64-NEXT: vadd.vv v24, v0, v0
+; RV64-NEXT: vxor.vv v24, v0, v24
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vand.vv v16, v16, v24
+; RV64-NEXT: vsub.vv v8, v8, v16
+; RV64-NEXT: vand.vv v16, v8, v0
; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: vadd.vv v8, v24, v8
-; RV64-NEXT: vsrl.vi v24, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v24
-; RV64-NEXT: lui a4, 61681
-; RV64-NEXT: addiw a4, a4, -241
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: vand.vx v8, v8, a4
-; RV64-NEXT: lui a5, 4112
-; RV64-NEXT: addiw a5, a5, 257
-; RV64-NEXT: slli a6, a5, 32
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: vmul.vx v8, v8, a5
-; RV64-NEXT: li a6, 56
-; RV64-NEXT: vsrl.vx v8, v8, a6
-; RV64-NEXT: addi a7, a0, -16
-; RV64-NEXT: sltu a0, a0, a7
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v0, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v0, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
+; RV64-NEXT: li a3, 56
+; RV64-NEXT: vsrl.vx v8, v8, a3
+; RV64-NEXT: addi a4, sp, 16
+; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: addi a4, a0, -16
+; RV64-NEXT: sltu a0, a0, a4
; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a7
+; RV64-NEXT: and a0, a0, a4
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v24, v16, a2
-; RV64-NEXT: vnot.v v16, v16
-; RV64-NEXT: vand.vv v16, v16, v24
-; RV64-NEXT: vsrl.vi v24, v16, 1
-; RV64-NEXT: vand.vx v24, v24, a1
-; RV64-NEXT: vsub.vv v16, v16, v24
-; RV64-NEXT: vand.vx v24, v16, a3
-; RV64-NEXT: vsrl.vi v16, v16, 2
-; RV64-NEXT: vand.vx v16, v16, a3
-; RV64-NEXT: vadd.vv v16, v24, v16
-; RV64-NEXT: vsrl.vi v24, v16, 4
-; RV64-NEXT: vadd.vv v16, v16, v24
-; RV64-NEXT: vand.vx v16, v16, a4
-; RV64-NEXT: vmul.vx v16, v16, a5
-; RV64-NEXT: vsrl.vx v16, v16, a6
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsub.vx v16, v8, a2
+; RV64-NEXT: vnot.v v24, v8
+; RV64-NEXT: vand.vv v16, v24, v16
+; RV64-NEXT: vsll.vi v24, v0, 2
+; RV64-NEXT: vxor.vx v24, v24, a1
+; RV64-NEXT: vadd.vv v0, v24, v24
+; RV64-NEXT: vxor.vv v8, v24, v0
+; RV64-NEXT: vsrl.vi v0, v16, 1
+; RV64-NEXT: vand.vv v8, v0, v8
+; RV64-NEXT: vsub.vv v8, v16, v8
+; RV64-NEXT: vand.vv v16, v8, v24
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vand.vx v8, v8, a1
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsrl.vi v16, v16, 3
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v8, v8, v16
+; RV64-NEXT: vsrl.vx v16, v8, a3
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%head = insertelement <32 x i1> poison, i1 true, i32 0
%m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
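(Editor's note, not part of the patch: for readers skimming the regenerated check lines above, the new sequences derive the other ctpop masks from the single materialized constant 0x0f0f0f0f0f0f0f0f instead of materializing each 64-bit constant separately. Below is a minimal C sketch of the identities the vector code relies on (vsll.vi+vxor, vadd.vv+vxor, vsrl.vi 3+vand); the variable names are illustrative and do not appear in the patch.

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    /* The only constant built with scalar materialization instructions. */
    uint64_t c0f = 0x0f0f0f0f0f0f0f0fULL;

    /* vsll.vi v, 2 ; vxor   -> 0x3333333333333333 */
    uint64_t c33 = c0f ^ (c0f << 2);
    /* vadd.vv v, v (== v << 1) ; vxor -> 0x5555555555555555 */
    uint64_t c55 = c33 ^ (c33 << 1);
    /* vsrl.vi v, 3 ; vand   -> 0x0101010101010101 (the byte-sum multiplier) */
    uint64_t c01 = c0f & (c0f >> 3);

    assert(c33 == 0x3333333333333333ULL);
    assert(c55 == 0x5555555555555555ULL);
    assert(c01 == 0x0101010101010101ULL);
    return 0;
  }
)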
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
index 8c8da6d1e00313..c778fab049005e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -250,40 +250,31 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32I-NEXT: vle64.v v8, (a0)
-; RV32I-NEXT: li a1, 1
-; RV32I-NEXT: vsub.vx v9, v8, a1
-; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vsrl.vi v9, v8, 1
-; RV32I-NEXT: lui a1, 349525
-; RV32I-NEXT: addi a1, a1, 1365
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a1
-; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v9, v9, v10
-; RV32I-NEXT: vsub.vv v8, v8, v9
-; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi a1, a1, 819
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a1
-; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v10, v8, v9
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vadd.vv v8, v10, v8
-; RV32I-NEXT: vsrl.vi v9, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v9
; RV32I-NEXT: lui a1, 61681
; RV32I-NEXT: addi a1, a1, -241
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32I-NEXT: vmv.v.x v9, a1
; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT: vsll.vi v10, v9, 2
+; RV32I-NEXT: vxor.vv v10, v9, v10
+; RV32I-NEXT: vadd.vv v11, v10, v10
+; RV32I-NEXT: vxor.vv v11, v10, v11
+; RV32I-NEXT: li a1, 1
+; RV32I-NEXT: vsub.vx v12, v8, a1
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: vand.vv v11, v12, v11
+; RV32I-NEXT: vsub.vv v8, v8, v11
+; RV32I-NEXT: vand.vv v11, v8, v10
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vadd.vv v8, v11, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi a1, a1, 257
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a1
-; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT: vsrl.vi v10, v9, 3
+; RV32I-NEXT: vand.vv v9, v9, v10
; RV32I-NEXT: vmul.vv v8, v8, v9
; RV32I-NEXT: li a1, 56
; RV32I-NEXT: vsrl.vx v8, v8, a1
@@ -294,37 +285,34 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64I-NEXT: vle64.v v8, (a0)
-; RV64I-NEXT: li a1, 1
-; RV64I-NEXT: vsub.vx v9, v8, a1
-; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vand.vv v8, v8, v9
-; RV64I-NEXT: vsrl.vi v9, v8, 1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw a1, a1, 1365
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v9, v9, a1
-; RV64I-NEXT: vsub.vv v8, v8, v9
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v9, v8, a1
-; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a1
-; RV64I-NEXT: vadd.vv v8, v9, v8
-; RV64I-NEXT: vsrl.vi v9, v8, 4
-; RV64I-NEXT: vadd.vv v8, v8, v9
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64I-NEXT: vmv.v.x v9, a1
+; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64I-NEXT: vsll.vi v10, v9, 2
+; RV64I-NEXT: vxor.vx v10, v10, a1
+; RV64I-NEXT: vadd.vv v11, v10, v10
+; RV64I-NEXT: vxor.vv v11, v10, v11
+; RV64I-NEXT: li a2, 1
+; RV64I-NEXT: vsub.vx v12, v8, a2
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vand.vv v11, v12, v11
+; RV64I-NEXT: vsub.vv v8, v8, v11
+; RV64I-NEXT: vand.vv v11, v8, v10
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vv v8, v8, v10
+; RV64I-NEXT: vadd.vv v8, v11, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
; RV64I-NEXT: vand.vx v8, v8, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: vsrl.vi v9, v9, 3
+; RV64I-NEXT: vand.vx v9, v9, a1
+; RV64I-NEXT: vmul.vv v8, v8, v9
; RV64I-NEXT: li a1, 56
; RV64I-NEXT: vsrl.vx v8, v8, a1
; RV64I-NEXT: vse64.v v8, (a0)
@@ -651,40 +639,31 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32I-NEXT: vle64.v v8, (a0)
-; RV32I-NEXT: li a1, 1
-; RV32I-NEXT: vsub.vx v10, v8, a1
-; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vsrl.vi v10, v8, 1
-; RV32I-NEXT: lui a1, 349525
-; RV32I-NEXT: addi a1, a1, 1365
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a1
-; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v10, v10, v12
-; RV32I-NEXT: vsub.vv v8, v8, v10
-; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi a1, a1, 819
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a1
-; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v12, v8, v10
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vadd.vv v8, v12, v8
-; RV32I-NEXT: vsrl.vi v10, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v10
; RV32I-NEXT: lui a1, 61681
; RV32I-NEXT: addi a1, a1, -241
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32I-NEXT: vmv.v.x v10, a1
; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT: vsll.vi v12, v10, 2
+; RV32I-NEXT: vxor.vv v12, v10, v12
+; RV32I-NEXT: vadd.vv v14, v12, v12
+; RV32I-NEXT: vxor.vv v14, v12, v14
+; RV32I-NEXT: li a1, 1
+; RV32I-NEXT: vsub.vx v16, v8, a1
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 1
+; RV32I-NEXT: vand.vv v14, v16, v14
+; RV32I-NEXT: vsub.vv v8, v8, v14
+; RV32I-NEXT: vand.vv v14, v8, v12
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vadd.vv v8, v14, v8
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v12
; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi a1, a1, 257
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a1
-; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT: vsrl.vi v12, v10, 3
+; RV32I-NEXT: vand.vv v10, v10, v12
; RV32I-NEXT: vmul.vv v8, v8, v10
; RV32I-NEXT: li a1, 56
; RV32I-NEXT: vsrl.vx v8, v8, a1
@@ -695,37 +674,34 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64I-NEXT: vle64.v v8, (a0)
-; RV64I-NEXT: li a1, 1
-; RV64I-NEXT: vsub.vx v10, v8, a1
-; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vand.vv v8, v8, v10
-; RV64I-NEXT: vsrl.vi v10, v8, 1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw a1, a1, 1365
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v10, v10, a1
-; RV64I-NEXT: vsub.vv v8, v8, v10
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v10, v8, a1
-; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a1
-; RV64I-NEXT: vadd.vv v8, v10, v8
-; RV64I-NEXT: vsrl.vi v10, v8, 4
-; RV64I-NEXT: vadd.vv v8, v8, v10
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64I-NEXT: vmv.v.x v10, a1
+; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64I-NEXT: vsll.vi v12, v10, 2
+; RV64I-NEXT: vxor.vx v12, v12, a1
+; RV64I-NEXT: vadd.vv v14, v12, v12
+; RV64I-NEXT: vxor.vv v14, v12, v14
+; RV64I-NEXT: li a2, 1
+; RV64I-NEXT: vsub.vx v16, v8, a2
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: vand.vv v14, v16, v14
+; RV64I-NEXT: vsub.vv v8, v8, v14
+; RV64I-NEXT: vand.vv v14, v8, v12
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: vadd.vv v8, v14, v8
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
; RV64I-NEXT: vand.vx v8, v8, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: vsrl.vi v10, v10, 3
+; RV64I-NEXT: vand.vx v10, v10, a1
+; RV64I-NEXT: vmul.vv v8, v8, v10
; RV64I-NEXT: li a1, 56
; RV64I-NEXT: vsrl.vx v8, v8, a1
; RV64I-NEXT: vse64.v v8, (a0)
@@ -1029,40 +1005,31 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32I-NEXT: vle64.v v8, (a0)
-; RV32I-NEXT: li a1, 1
-; RV32I-NEXT: vsub.vx v9, v8, a1
-; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vsrl.vi v9, v8, 1
-; RV32I-NEXT: lui a1, 349525
-; RV32I-NEXT: addi a1, a1, 1365
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a1
-; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v9, v9, v10
-; RV32I-NEXT: vsub.vv v8, v8, v9
-; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi a1, a1, 819
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a1
-; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v10, v8, v9
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vadd.vv v8, v10, v8
-; RV32I-NEXT: vsrl.vi v9, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v9
; RV32I-NEXT: lui a1, 61681
; RV32I-NEXT: addi a1, a1, -241
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; RV32I-NEXT: vmv.v.x v9, a1
; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT: vsll.vi v10, v9, 2
+; RV32I-NEXT: vxor.vv v10, v9, v10
+; RV32I-NEXT: vadd.vv v11, v10, v10
+; RV32I-NEXT: vxor.vv v11, v10, v11
+; RV32I-NEXT: li a1, 1
+; RV32I-NEXT: vsub.vx v12, v8, a1
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: vand.vv v11, v12, v11
+; RV32I-NEXT: vsub.vv v8, v8, v11
+; RV32I-NEXT: vand.vv v11, v8, v10
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vadd.vv v8, v11, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi a1, a1, 257
-; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a1
-; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT: vsrl.vi v10, v9, 3
+; RV32I-NEXT: vand.vv v9, v9, v10
; RV32I-NEXT: vmul.vv v8, v8, v9
; RV32I-NEXT: li a1, 56
; RV32I-NEXT: vsrl.vx v8, v8, a1
@@ -1073,37 +1040,34 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64I-NEXT: vle64.v v8, (a0)
-; RV64I-NEXT: li a1, 1
-; RV64I-NEXT: vsub.vx v9, v8, a1
-; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vand.vv v8, v8, v9
-; RV64I-NEXT: vsrl.vi v9, v8, 1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw a1, a1, 1365
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v9, v9, a1
-; RV64I-NEXT: vsub.vv v8, v8, v9
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v9, v8, a1
-; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a1
-; RV64I-NEXT: vadd.vv v8, v9, v8
-; RV64I-NEXT: vsrl.vi v9, v8, 4
-; RV64I-NEXT: vadd.vv v8, v8, v9
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; RV64I-NEXT: vmv.v.x v9, a1
+; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64I-NEXT: vsll.vi v10, v9, 2
+; RV64I-NEXT: vxor.vx v10, v10, a1
+; RV64I-NEXT: vadd.vv v11, v10, v10
+; RV64I-NEXT: vxor.vv v11, v10, v11
+; RV64I-NEXT: li a2, 1
+; RV64I-NEXT: vsub.vx v12, v8, a2
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vand.vv v11, v12, v11
+; RV64I-NEXT: vsub.vv v8, v8, v11
+; RV64I-NEXT: vand.vv v11, v8, v10
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vv v8, v8, v10
+; RV64I-NEXT: vadd.vv v8, v11, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
; RV64I-NEXT: vand.vx v8, v8, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: vsrl.vi v9, v9, 3
+; RV64I-NEXT: vand.vx v9, v9, a1
+; RV64I-NEXT: vmul.vv v8, v8, v9
; RV64I-NEXT: li a1, 56
; RV64I-NEXT: vsrl.vx v8, v8, a1
; RV64I-NEXT: vse64.v v8, (a0)
@@ -1400,40 +1364,31 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32I-NEXT: vle64.v v8, (a0)
-; RV32I-NEXT: li a1, 1
-; RV32I-NEXT: vsub.vx v10, v8, a1
-; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vsrl.vi v10, v8, 1
-; RV32I-NEXT: lui a1, 349525
-; RV32I-NEXT: addi a1, a1, 1365
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a1
-; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v10, v10, v12
-; RV32I-NEXT: vsub.vv v8, v8, v10
-; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi a1, a1, 819
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a1
-; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v12, v8, v10
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vadd.vv v8, v12, v8
-; RV32I-NEXT: vsrl.vi v10, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v10
; RV32I-NEXT: lui a1, 61681
; RV32I-NEXT: addi a1, a1, -241
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT: vsetvli a2, zero, e32, m2, ta, ma
; RV32I-NEXT: vmv.v.x v10, a1
; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT: vsll.vi v12, v10, 2
+; RV32I-NEXT: vxor.vv v12, v10, v12
+; RV32I-NEXT: vadd.vv v14, v12, v12
+; RV32I-NEXT: vxor.vv v14, v12, v14
+; RV32I-NEXT: li a1, 1
+; RV32I-NEXT: vsub.vx v16, v8, a1
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 1
+; RV32I-NEXT: vand.vv v14, v16, v14
+; RV32I-NEXT: vsub.vv v8, v8, v14
+; RV32I-NEXT: vand.vv v14, v8, v12
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vadd.vv v8, v14, v8
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v12
; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi a1, a1, 257
-; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a1
-; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT: vsrl.vi v12, v10, 3
+; RV32I-NEXT: vand.vv v10, v10, v12
; RV32I-NEXT: vmul.vv v8, v8, v10
; RV32I-NEXT: li a1, 56
; RV32I-NEXT: vsrl.vx v8, v8, a1
@@ -1444,37 +1399,34 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64I-NEXT: vle64.v v8, (a0)
-; RV64I-NEXT: li a1, 1
-; RV64I-NEXT: vsub.vx v10, v8, a1
-; RV64I-NEXT: vnot.v v8, v8
-; RV64I-NEXT: vand.vv v8, v8, v10
-; RV64I-NEXT: vsrl.vi v10, v8, 1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw a1, a1, 1365
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v10, v10, a1
-; RV64I-NEXT: vsub.vv v8, v8, v10
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vand.vx v10, v8, a1
-; RV64I-NEXT: vsrl.vi v8, v8, 2
-; RV64I-NEXT: vand.vx v8, v8, a1
-; RV64I-NEXT: vadd.vv v8, v10, v8
-; RV64I-NEXT: vsrl.vi v10, v8, 4
-; RV64I-NEXT: vadd.vv v8, v8, v10
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
+; RV64I-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; RV64I-NEXT: vmv.v.x v10, a1
+; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64I-NEXT: vsll.vi v12, v10, 2
+; RV64I-NEXT: vxor.vx v12, v12, a1
+; RV64I-NEXT: vadd.vv v14, v12, v12
+; RV64I-NEXT: vxor.vv v14, v12, v14
+; RV64I-NEXT: li a2, 1
+; RV64I-NEXT: vsub.vx v16, v8, a2
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: vand.vv v14, v16, v14
+; RV64I-NEXT: vsub.vv v8, v8, v14
+; RV64I-NEXT: vand.vv v14, v8, v12
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: vadd.vv v8, v14, v8
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
; RV64I-NEXT: vand.vx v8, v8, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: vsrl.vi v10, v10, 3
+; RV64I-NEXT: vand.vx v10, v10, a1
+; RV64I-NEXT: vmul.vv v8, v8, v10
; RV64I-NEXT: li a1, 56
; RV64I-NEXT: vsrl.vx v8, v8, a1
; RV64I-NEXT: vse64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll
index f707cb31e3eced..0b788878a601e5 100644
--- a/llvm/test/CodeGen/RISCV/sextw-removal.ll
+++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll
@@ -176,30 +176,34 @@ define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: sraw a0, a0, a1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw s0, a1, 1365
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw s1, a1, 819
; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addi s2, a1, -241
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addi s3, a1, 257
+; RV64I-NEXT: addiw s0, a1, -241
+; RV64I-NEXT: slli a1, s0, 32
+; RV64I-NEXT: add s0, s0, a1
+; RV64I-NEXT: slli s1, s0, 2
+; RV64I-NEXT: xor s1, s1, s0
+; RV64I-NEXT: lui a1, 349525
+; RV64I-NEXT: addiw s2, a1, 1365
+; RV64I-NEXT: srli a1, s0, 3
+; RV64I-NEXT: and s3, a1, s0
; RV64I-NEXT: .LBB4_1: # %bb2
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: call bar
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: srli a0, a0, 1
-; RV64I-NEXT: and a0, a0, s0
-; RV64I-NEXT: sub a0, a1, a0
-; RV64I-NEXT: and a2, a0, s1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, s1
-; RV64I-NEXT: add a0, a2, a0
+; RV64I-NEXT: and a0, a0, s2
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: srli a2, a2, 32
+; RV64I-NEXT: sub a2, a2, a0
+; RV64I-NEXT: and a0, a2, s1
+; RV64I-NEXT: srli a2, a2, 2
+; RV64I-NEXT: and a2, a2, s1
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: srli a2, a0, 4
; RV64I-NEXT: add a0, a0, a2
-; RV64I-NEXT: and a0, a0, s2
+; RV64I-NEXT: and a0, a0, s0
; RV64I-NEXT: mul a0, a0, s3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: bnez a1, .LBB4_1
; RV64I-NEXT: # %bb.2: # %bb7
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
@@ -323,27 +327,23 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: sraw a0, a0, a1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw s0, a1, 1365
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw s0, a1, -241
; RV64I-NEXT: slli a1, s0, 32
; RV64I-NEXT: add s0, s0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw s1, a1, 819
-; RV64I-NEXT: slli a1, s1, 32
-; RV64I-NEXT: add s1, s1, a1
-; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addiw s2, a1, -241
+; RV64I-NEXT: slli s1, s0, 2
+; RV64I-NEXT: xor s1, s1, s0
+; RV64I-NEXT: lui a1, 349525
+; RV64I-NEXT: addiw s2, a1, 1365
; RV64I-NEXT: slli a1, s2, 32
; RV64I-NEXT: add s2, s2, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw s3, a1, 257
-; RV64I-NEXT: slli a1, s3, 32
-; RV64I-NEXT: add s3, s3, a1
+; RV64I-NEXT: srli a1, s0, 3
+; RV64I-NEXT: and s3, a1, s0
; RV64I-NEXT: .LBB6_1: # %bb2
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: call foo
; RV64I-NEXT: srli a1, a0, 1
-; RV64I-NEXT: and a1, a1, s0
+; RV64I-NEXT: and a1, a1, s2
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: and a1, a0, s1
; RV64I-NEXT: srli a0, a0, 2
@@ -351,7 +351,7 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: and a0, a0, s2
+; RV64I-NEXT: and a0, a0, s0
; RV64I-NEXT: mul a0, a0, s3
; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: bnez a0, .LBB6_1
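(Editor's note, not part of the patch: the sextw-removal.ll changes above show the same idea on scalar registers: s0 holds 0x0f0f0f0f0f0f0f0f, and s1 = (s0 << 2) ^ s0 and s3 = (s0 >> 3) & s0 are hoisted out of the loop. A rough C equivalent of the resulting loop body, assuming the constants derived in the earlier sketch; the function and parameter names are mine, not from the test.

  #include <stdint.h>

  /* Mirrors the saved registers in the RV64I output of test7; only s0 is
     materialized with lui/addiw/slli/add, the rest are derived from it. */
  static uint64_t popcount_swar(uint64_t x,
                                uint64_t s0 /* 0x0f0f... */,
                                uint64_t s1 /* 0x3333... */,
                                uint64_t s2 /* 0x5555... */,
                                uint64_t s3 /* 0x0101... */) {
    x -= (x >> 1) & s2;              /* srli, and, sub  */
    x = (x & s1) + ((x >> 2) & s1);  /* and, srli, and, add */
    x = (x + (x >> 4)) & s0;         /* srli, add, and  */
    return (x * s3) >> 56;           /* mul, srli 56    */
  }
)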