[llvm] 0274be2 - [RISCV] Lower vector CTLZ_ZERO_UNDEF/CTTZ_ZERO_UNDEF by converting to FP and extracting the exponent.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 17 10:31:08 PST 2021
Author: Craig Topper
Date: 2021-11-17T10:29:41-08:00
New Revision: 0274be28d7f8266e441a45adc1a208fc0ca04dd4
URL: https://github.com/llvm/llvm-project/commit/0274be28d7f8266e441a45adc1a208fc0ca04dd4
DIFF: https://github.com/llvm/llvm-project/commit/0274be28d7f8266e441a45adc1a208fc0ca04dd4.diff
LOG: [RISCV] Lower vector CTLZ_ZERO_UNDEF/CTTZ_ZERO_UNDEF by converting to FP and extracting the exponent.
If we have a large enough floating-point type that can exactly
represent the integer value, we can convert the value to FP and
use the exponent to compute the number of leading/trailing zeros.
The exponent will contain log2 of the value plus the exponent bias.
We can then remove the bias and convert from log2 to leading/trailing
zeros.
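
For intuition, here is a minimal scalar C++ sketch of the same trick
(illustration only, not part of the patch; the helper names are made up).
It assumes a nonzero 32-bit input and IEEE-754 doubles, whose 52-bit
mantissa represents any 32-bit integer exactly:

  #include <cstdint>
  #include <cstring>

  // Leading zeros of a nonzero 32-bit value via the double exponent.
  unsigned ctlz32_via_fp(uint32_t X) {
    double D = static_cast<double>(X);    // exact for any 32-bit integer
    uint64_t Bits;
    std::memcpy(&Bits, &D, sizeof(Bits)); // bitcast the double to integer
    unsigned Exp = (Bits >> 52) & 0x7FF;  // biased exponent = floor(log2(X)) + 1023
    return 31 - (Exp - 1023);             // convert log2 to leading zeros
  }

  // Trailing zeros: isolate the lowest set bit with X & -X, then take its log2.
  unsigned cttz32_via_fp(uint32_t X) {
    uint32_t LowBit = X & -X;
    double D = static_cast<double>(LowBit);
    uint64_t Bits;
    std::memcpy(&Bits, &D, sizeof(Bits));
    return ((Bits >> 52) & 0x7FF) - 1023; // log2 of the isolated low bit
  }

The vector lowering below does the same thing with vfcvt/vfwcvt to convert,
a shift to move the exponent to the LSB, a narrowing shift back to the
original element type, and a subtract to remove the bias.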
This doesn't work for zero, since the exponent field of zero is zero, so
we can only do this directly for CTLZ_ZERO_UNDEF/CTTZ_ZERO_UNDEF. If we
need a defined value for zero we can use a vmseq and a vmerge to handle it.
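
A hypothetical scalar wrapper (not in the patch) shows that zero fallback;
the vector lowering emits the equivalent vmseq.vi/vmerge pair, as visible in
the updated tests below:

  // Full ctlz: select the element width when the input is zero, since the
  // FP-exponent trick above is only valid for nonzero inputs.
  unsigned ctlz32(uint32_t X) {
    return X == 0 ? 32u : ctlz32_via_fp(X);
  }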
We need to be careful to make sure the floating-point type is legal.
If it isn't, we'll continue using the integer expansion. We could split
the vector and concatenate the results, but that needs some additional
work and evaluation.
Differential Revision: https://reviews.llvm.org/D111904
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index f4e1fe25f5366..1c1712c63fd61 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7083,8 +7083,8 @@ SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const {
SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op);
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
- return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
- DAG.getConstant(NumBitsPerElt, dl, VT), CTLZ);
+ return DAG.getSelect(dl, VT, SrcIsZero,
+ DAG.getConstant(NumBitsPerElt, dl, VT), CTLZ);
}
// Only expand vector types if we have the appropriate vector bit operations.
@@ -7132,8 +7132,8 @@ SDValue TargetLowering::expandCTTZ(SDNode *Node, SelectionDAG &DAG) const {
SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op);
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
- return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
- DAG.getConstant(NumBitsPerElt, dl, VT), CTTZ);
+ return DAG.getSelect(dl, VT, SrcIsZero,
+ DAG.getConstant(NumBitsPerElt, dl, VT), CTTZ);
}
// Only expand vector types if we have the appropriate vector bit operations.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 1bd7c24c9a176..ec1dc39bbe770 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -630,6 +630,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::SEXTLOAD, OtherVT, VT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, OtherVT, VT, Expand);
}
+
+ // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point
+ // type that can represent the value exactly.
+ if (VT.getVectorElementType() != MVT::i64) {
+ MVT FloatEltVT =
+ VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32;
+ EVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
+ if (isTypeLegal(FloatVT)) {
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
+ }
+ }
}
// Expand various CCs to best match the RVV ISA, which natively supports UNE
@@ -848,6 +860,19 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
for (unsigned VPOpc : IntegerVPOps)
setOperationAction(VPOpc, VT, Custom);
+
+ // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point
+ // type that can represent the value exactly.
+ if (VT.getVectorElementType() != MVT::i64) {
+ MVT FloatEltVT =
+ VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32;
+ EVT FloatVT =
+ MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
+ if (isTypeLegal(FloatVT)) {
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
+ }
+ }
}
for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
@@ -2323,6 +2348,57 @@ static SDValue getRVVFPExtendOrRound(SDValue Op, MVT VT, MVT ContainerVT,
return DAG.getNode(RVVOpc, DL, ContainerVT, Op, Mask, VL);
}
+// Lower CTLZ_ZERO_UNDEF or CTTZ_ZERO_UNDEF by converting to FP and extracting
+// the exponent.
+static SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ unsigned EltSize = VT.getScalarSizeInBits();
+ SDValue Src = Op.getOperand(0);
+ SDLoc DL(Op);
+
+ // We need a FP type that can represent the value.
+ // TODO: Use f16 for i8 when possible?
+ MVT FloatEltVT = EltSize == 32 ? MVT::f64 : MVT::f32;
+ MVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
+
+ // Legal types should have been checked in the RISCVTargetLowering
+ // constructor.
+ // TODO: Splitting may make sense in some cases.
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(FloatVT) &&
+ "Expected legal float type!");
+
+ // For CTTZ_ZERO_UNDEF, we need to extract the lowest set bit using X & -X.
+ // The trailing zero count is equal to log2 of this single bit value.
+ if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
+ SDValue Neg =
+ DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
+ Src = DAG.getNode(ISD::AND, DL, VT, Src, Neg);
+ }
+
+ // We have a legal FP type, convert to it.
+ SDValue FloatVal = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVT, Src);
+ // Bitcast to integer and shift the exponent to the LSB.
+ EVT IntVT = FloatVT.changeVectorElementTypeToInteger();
+ SDValue Bitcast = DAG.getBitcast(IntVT, FloatVal);
+ unsigned ShiftAmt = FloatEltVT == MVT::f64 ? 52 : 23;
+ SDValue Shift = DAG.getNode(ISD::SRL, DL, IntVT, Bitcast,
+ DAG.getConstant(ShiftAmt, DL, IntVT));
+ // Truncate back to original type to allow vnsrl.
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Shift);
+ // The exponent contains log2 of the value in biased form.
+ unsigned ExponentBias = FloatEltVT == MVT::f64 ? 1023 : 127;
+
+ // For trailing zeros, we just need to subtract the bias.
+ if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF)
+ return DAG.getNode(ISD::SUB, DL, VT, Trunc,
+ DAG.getConstant(ExponentBias, DL, VT));
+
+ // For leading zeros, we need to remove the bias and convert from log2 to
+ // leading zeros. We can do this by subtracting from (Bias + (EltSize - 1)).
+ unsigned Adjust = ExponentBias + (EltSize - 1);
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(Adjust, DL, VT), Trunc);
+}
+
// While RVV has alignment restrictions, we should always be able to load as a
// legal equivalently-sized byte-typed vector instead. This method is
// responsible for re-expressing a ISD::LOAD via a correctly-aligned type. If
@@ -2941,6 +3017,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerToScalableOp(Op, DAG, RISCVISD::FMAXNUM_VL);
case ISD::ABS:
return lowerABS(Op, DAG);
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTTZ_ZERO_UNDEF:
+ return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG);
case ISD::VSELECT:
return lowerFixedLengthVectorSelectToRVV(Op, DAG);
case ISD::FCOPYSIGN:
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
index a2273c1348a21..425ed47fb5475 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
@@ -1,147 +1,429 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32I
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32D
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64D
define <vscale x 1 x i8> @ctlz_nxv1i8(<vscale x 1 x i8> %va) {
-; CHECK-LABEL: ctlz_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 2
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: ctlz_nxv1i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e8, mf8, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv1i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e8, mf8, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv1i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vzext.vf4 v9, v8
+; RV32D-NEXT: vfcvt.f.xu.v v9, v9
+; RV32D-NEXT: vsrl.vi v9, v9, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v9, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v9, 0
+; RV32D-NEXT: addi a0, zero, 134
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: vrsub.vx v8, v9, a0
+; RV32D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv1i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vzext.vf4 v9, v8
+; RV64D-NEXT: vfcvt.f.xu.v v9, v9
+; RV64D-NEXT: vsrl.vi v9, v9, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV64D-NEXT: vnsrl.wi v9, v9, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
+; RV64D-NEXT: vnsrl.wi v9, v9, 0
+; RV64D-NEXT: addi a0, zero, 134
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: vrsub.vx v8, v9, a0
+; RV64D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV64D-NEXT: ret
%a = call <vscale x 1 x i8> @llvm.ctlz.nxv1i8(<vscale x 1 x i8> %va, i1 false)
ret <vscale x 1 x i8> %a
}
declare <vscale x 1 x i8> @llvm.ctlz.nxv1i8(<vscale x 1 x i8>, i1)
define <vscale x 2 x i8> @ctlz_nxv2i8(<vscale x 2 x i8> %va) {
-; CHECK-LABEL: ctlz_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 2
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: ctlz_nxv2i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e8, mf4, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv2i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e8, mf4, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv2i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV32D-NEXT: vzext.vf4 v9, v8
+; RV32D-NEXT: vfcvt.f.xu.v v9, v9
+; RV32D-NEXT: vsrl.vi v9, v9, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v9, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v9, 0
+; RV32D-NEXT: addi a0, zero, 134
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: vrsub.vx v8, v9, a0
+; RV32D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv2i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV64D-NEXT: vzext.vf4 v9, v8
+; RV64D-NEXT: vfcvt.f.xu.v v9, v9
+; RV64D-NEXT: vsrl.vi v9, v9, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v9, v9, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
+; RV64D-NEXT: vnsrl.wi v9, v9, 0
+; RV64D-NEXT: addi a0, zero, 134
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: vrsub.vx v8, v9, a0
+; RV64D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV64D-NEXT: ret
%a = call <vscale x 2 x i8> @llvm.ctlz.nxv2i8(<vscale x 2 x i8> %va, i1 false)
ret <vscale x 2 x i8> %a
}
declare <vscale x 2 x i8> @llvm.ctlz.nxv2i8(<vscale x 2 x i8>, i1)
define <vscale x 4 x i8> @ctlz_nxv4i8(<vscale x 4 x i8> %va) {
-; CHECK-LABEL: ctlz_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 2
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: ctlz_nxv4i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e8, mf2, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv4i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e8, mf2, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv4i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV32D-NEXT: vzext.vf4 v10, v8
+; RV32D-NEXT: vfcvt.f.xu.v v10, v10
+; RV32D-NEXT: vsrl.vi v10, v10, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v10, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v9, 0
+; RV32D-NEXT: addi a0, zero, 134
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: vrsub.vx v8, v9, a0
+; RV32D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv4i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV64D-NEXT: vzext.vf4 v10, v8
+; RV64D-NEXT: vfcvt.f.xu.v v10, v10
+; RV64D-NEXT: vsrl.vi v10, v10, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v9, v10, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v9, v9, 0
+; RV64D-NEXT: addi a0, zero, 134
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: vrsub.vx v8, v9, a0
+; RV64D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV64D-NEXT: ret
%a = call <vscale x 4 x i8> @llvm.ctlz.nxv4i8(<vscale x 4 x i8> %va, i1 false)
ret <vscale x 4 x i8> %a
}
declare <vscale x 4 x i8> @llvm.ctlz.nxv4i8(<vscale x 4 x i8>, i1)
define <vscale x 8 x i8> @ctlz_nxv8i8(<vscale x 8 x i8> %va) {
-; CHECK-LABEL: ctlz_nxv8i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, mu
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 2
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: ctlz_nxv8i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e8, m1, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv8i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e8, m1, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv8i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV32D-NEXT: vzext.vf4 v12, v8
+; RV32D-NEXT: vfcvt.f.xu.v v12, v12
+; RV32D-NEXT: vsrl.vi v12, v12, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v10, v12, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v10, 0
+; RV32D-NEXT: addi a0, zero, 134
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: vrsub.vx v8, v9, a0
+; RV32D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv8i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV64D-NEXT: vzext.vf4 v12, v8
+; RV64D-NEXT: vfcvt.f.xu.v v12, v12
+; RV64D-NEXT: vsrl.vi v12, v12, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v10, v12, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v9, v10, 0
+; RV64D-NEXT: addi a0, zero, 134
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: vrsub.vx v8, v9, a0
+; RV64D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV64D-NEXT: ret
%a = call <vscale x 8 x i8> @llvm.ctlz.nxv8i8(<vscale x 8 x i8> %va, i1 false)
ret <vscale x 8 x i8> %a
}
declare <vscale x 8 x i8> @llvm.ctlz.nxv8i8(<vscale x 8 x i8>, i1)
define <vscale x 16 x i8> @ctlz_nxv16i8(<vscale x 16 x i8> %va) {
-; CHECK-LABEL: ctlz_nxv16i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, mu
-; CHECK-NEXT: vsrl.vi v10, v8, 1
-; CHECK-NEXT: vor.vv v8, v8, v10
-; CHECK-NEXT: vsrl.vi v10, v8, 2
-; CHECK-NEXT: vor.vv v8, v8, v10
-; CHECK-NEXT: vsrl.vi v10, v8, 4
-; CHECK-NEXT: vor.vv v8, v8, v10
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vsrl.vi v10, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v10, v10, a0
-; CHECK-NEXT: vsub.vv v8, v8, v10
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v10, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v10, v8
-; CHECK-NEXT: vsrl.vi v10, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v10
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: ctlz_nxv16i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e8, m2, ta, mu
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v10, v10, a0
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v10, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv16i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e8, m2, ta, mu
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv16i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m8, ta, mu
+; RV32D-NEXT: vzext.vf4 v16, v8
+; RV32D-NEXT: vfcvt.f.xu.v v16, v16
+; RV32D-NEXT: vsrl.vi v16, v16, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV32D-NEXT: vnsrl.wi v12, v16, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v10, v12, 0
+; RV32D-NEXT: addi a0, zero, 134
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: vrsub.vx v8, v10, a0
+; RV32D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv16i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m8, ta, mu
+; RV64D-NEXT: vzext.vf4 v16, v8
+; RV64D-NEXT: vfcvt.f.xu.v v16, v16
+; RV64D-NEXT: vsrl.vi v16, v16, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV64D-NEXT: vnsrl.wi v12, v16, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v10, v12, 0
+; RV64D-NEXT: addi a0, zero, 134
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: vrsub.vx v8, v10, a0
+; RV64D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV64D-NEXT: ret
%a = call <vscale x 16 x i8> @llvm.ctlz.nxv16i8(<vscale x 16 x i8> %va, i1 false)
ret <vscale x 16 x i8> %a
}
@@ -206,31 +488,536 @@ define <vscale x 64 x i8> @ctlz_nxv64i8(<vscale x 64 x i8> %va) {
declare <vscale x 64 x i8> @llvm.ctlz.nxv64i8(<vscale x 64 x i8>, i1)
define <vscale x 1 x i16> @ctlz_nxv1i16(<vscale x 1 x i16> %va) {
-; RV32-LABEL: ctlz_nxv1i16:
+; RV32I-LABEL: ctlz_nxv1i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv1i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv1i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vsrl.vi v9, v9, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v9, 0
+; RV32D-NEXT: addi a0, zero, 142
+; RV32D-NEXT: vrsub.vx v9, v9, a0
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: addi a0, zero, 16
+; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv1i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vsrl.vi v9, v9, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV64D-NEXT: vnsrl.wi v9, v9, 0
+; RV64D-NEXT: addi a0, zero, 142
+; RV64D-NEXT: vrsub.vx v9, v9, a0
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: addi a0, zero, 16
+; RV64D-NEXT: vmerge.vxm v8, v9, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 1 x i16> @llvm.ctlz.nxv1i16(<vscale x 1 x i16> %va, i1 false)
+ ret <vscale x 1 x i16> %a
+}
+declare <vscale x 1 x i16> @llvm.ctlz.nxv1i16(<vscale x 1 x i16>, i1)
+
+define <vscale x 2 x i16> @ctlz_nxv2i16(<vscale x 2 x i16> %va) {
+; RV32I-LABEL: ctlz_nxv2i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv2i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv2i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV32D-NEXT: vsrl.vi v9, v9, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v9, 0
+; RV32D-NEXT: addi a0, zero, 142
+; RV32D-NEXT: vrsub.vx v9, v9, a0
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: addi a0, zero, 16
+; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv2i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV64D-NEXT: vsrl.vi v9, v9, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v9, v9, 0
+; RV64D-NEXT: addi a0, zero, 142
+; RV64D-NEXT: vrsub.vx v9, v9, a0
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: addi a0, zero, 16
+; RV64D-NEXT: vmerge.vxm v8, v9, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 2 x i16> @llvm.ctlz.nxv2i16(<vscale x 2 x i16> %va, i1 false)
+ ret <vscale x 2 x i16> %a
+}
+declare <vscale x 2 x i16> @llvm.ctlz.nxv2i16(<vscale x 2 x i16>, i1)
+
+define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
+; RV32I-LABEL: ctlz_nxv4i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv4i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv4i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32D-NEXT: vsrl.vi v10, v10, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v10, 0
+; RV32D-NEXT: addi a0, zero, 142
+; RV32D-NEXT: vrsub.vx v9, v9, a0
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: addi a0, zero, 16
+; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv4i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64D-NEXT: vsrl.vi v10, v10, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v9, v10, 0
+; RV64D-NEXT: addi a0, zero, 142
+; RV64D-NEXT: vrsub.vx v9, v9, a0
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: addi a0, zero, 16
+; RV64D-NEXT: vmerge.vxm v8, v9, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 4 x i16> @llvm.ctlz.nxv4i16(<vscale x 4 x i16> %va, i1 false)
+ ret <vscale x 4 x i16> %a
+}
+declare <vscale x 4 x i16> @llvm.ctlz.nxv4i16(<vscale x 4 x i16>, i1)
+
+define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
+; RV32I-LABEL: ctlz_nxv8i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v10, v10, a0
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v10, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv8i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv8i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32D-NEXT: vsrl.vi v12, v12, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v10, v12, 0
+; RV32D-NEXT: addi a0, zero, 142
+; RV32D-NEXT: vrsub.vx v10, v10, a0
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: addi a0, zero, 16
+; RV32D-NEXT: vmerge.vxm v8, v10, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv8i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64D-NEXT: vsrl.vi v12, v12, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v10, v12, 0
+; RV64D-NEXT: addi a0, zero, 142
+; RV64D-NEXT: vrsub.vx v10, v10, a0
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: addi a0, zero, 16
+; RV64D-NEXT: vmerge.vxm v8, v10, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 8 x i16> @llvm.ctlz.nxv8i16(<vscale x 8 x i16> %va, i1 false)
+ ret <vscale x 8 x i16> %a
+}
+declare <vscale x 8 x i16> @llvm.ctlz.nxv8i16(<vscale x 8 x i16>, i1)
+
+define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
+; RV32I-LABEL: ctlz_nxv16i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v12, v12, a0
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v12, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv16i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vsub.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v12, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v12, v8
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv16i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; RV32D-NEXT: vsrl.vi v16, v16, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV32D-NEXT: vnsrl.wi v12, v16, 0
+; RV32D-NEXT: addi a0, zero, 142
+; RV32D-NEXT: vrsub.vx v12, v12, a0
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: addi a0, zero, 16
+; RV32D-NEXT: vmerge.vxm v8, v12, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv16i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; RV64D-NEXT: vsrl.vi v16, v16, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV64D-NEXT: vnsrl.wi v12, v16, 0
+; RV64D-NEXT: addi a0, zero, 142
+; RV64D-NEXT: vrsub.vx v12, v12, a0
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: addi a0, zero, 16
+; RV64D-NEXT: vmerge.vxm v8, v12, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 16 x i16> @llvm.ctlz.nxv16i16(<vscale x 16 x i16> %va, i1 false)
+ ret <vscale x 16 x i16> %a
+}
+declare <vscale x 16 x i16> @llvm.ctlz.nxv16i16(<vscale x 16 x i16>, i1)
+
+define <vscale x 32 x i16> @ctlz_nxv32i16(<vscale x 32 x i16> %va) {
+; RV32-LABEL: ctlz_nxv32i16:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v9
+; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 2
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 8
+; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v9, v8, 1
+; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: lui a0, 5
; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
+; RV32-NEXT: vand.vx v16, v16, a0
+; RV32-NEXT: vsub.vv v8, v8, v16
; RV32-NEXT: lui a0, 3
; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
+; RV32-NEXT: vand.vx v16, v8, a0
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
+; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
; RV32-NEXT: lui a0, 1
; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: vand.vx v8, v8, a0
@@ -239,31 +1026,31 @@ define <vscale x 1 x i16> @ctlz_nxv1i16(<vscale x 1 x i16> %va) {
; RV32-NEXT: vsrl.vi v8, v8, 8
; RV32-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv1i16:
+; RV64-LABEL: ctlz_nxv32i16:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v9
+; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vor.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 2
+; RV64-NEXT: vor.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vor.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 8
+; RV64-NEXT: vor.vv v8, v8, v16
; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v9, v8, 1
+; RV64-NEXT: vsrl.vi v16, v8, 1
; RV64-NEXT: lui a0, 5
; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
+; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vsub.vv v8, v8, v16
; RV64-NEXT: lui a0, 3
; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
+; RV64-NEXT: vand.vx v16, v8, a0
; RV64-NEXT: vsrl.vi v8, v8, 2
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
; RV64-NEXT: lui a0, 1
; RV64-NEXT: addiw a0, a0, -241
; RV64-NEXT: vand.vx v8, v8, a0
@@ -271,86 +1058,546 @@ define <vscale x 1 x i16> @ctlz_nxv1i16(<vscale x 1 x i16> %va) {
; RV64-NEXT: vmul.vx v8, v8, a0
; RV64-NEXT: vsrl.vi v8, v8, 8
; RV64-NEXT: ret
- %a = call <vscale x 1 x i16> @llvm.ctlz.nxv1i16(<vscale x 1 x i16> %va, i1 false)
- ret <vscale x 1 x i16> %a
+ %a = call <vscale x 32 x i16> @llvm.ctlz.nxv32i16(<vscale x 32 x i16> %va, i1 false)
+ ret <vscale x 32 x i16> %a
}
-declare <vscale x 1 x i16> @llvm.ctlz.nxv1i16(<vscale x 1 x i16>, i1)
+declare <vscale x 32 x i16> @llvm.ctlz.nxv32i16(<vscale x 32 x i16>, i1)
-define <vscale x 2 x i16> @ctlz_nxv2i16(<vscale x 2 x i16> %va) {
-; RV32-LABEL: ctlz_nxv2i16:
+define <vscale x 1 x i32> @ctlz_nxv1i32(<vscale x 1 x i32> %va) {
+; RV32I-LABEL: ctlz_nxv1i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv1i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv1i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32D-NEXT: vsrl.vx v9, v9, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v9, 0
+; RV32D-NEXT: addi a0, zero, 1054
+; RV32D-NEXT: vrsub.vx v9, v9, a0
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: addi a0, zero, 32
+; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv1i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64D-NEXT: vsrl.vx v9, v9, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v9, v9, 0
+; RV64D-NEXT: addi a0, zero, 1054
+; RV64D-NEXT: vrsub.vx v9, v9, a0
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: addi a0, zero, 32
+; RV64D-NEXT: vmerge.vxm v8, v9, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 1 x i32> @llvm.ctlz.nxv1i32(<vscale x 1 x i32> %va, i1 false)
+ ret <vscale x 1 x i32> %a
+}
+declare <vscale x 1 x i32> @llvm.ctlz.nxv1i32(<vscale x 1 x i32>, i1)
+
+define <vscale x 2 x i32> @ctlz_nxv2i32(<vscale x 2 x i32> %va) {
+; RV32I-LABEL: ctlz_nxv2i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv2i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv2i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; RV32D-NEXT: vsrl.vx v10, v10, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v10, 0
+; RV32D-NEXT: addi a0, zero, 1054
+; RV32D-NEXT: vrsub.vx v9, v9, a0
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: addi a0, zero, 32
+; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv2i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; RV64D-NEXT: vsrl.vx v10, v10, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v9, v10, 0
+; RV64D-NEXT: addi a0, zero, 1054
+; RV64D-NEXT: vrsub.vx v9, v9, a0
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: addi a0, zero, 32
+; RV64D-NEXT: vmerge.vxm v8, v9, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 2 x i32> @llvm.ctlz.nxv2i32(<vscale x 2 x i32> %va, i1 false)
+ ret <vscale x 2 x i32> %a
+}
+declare <vscale x 2 x i32> @llvm.ctlz.nxv2i32(<vscale x 2 x i32>, i1)
+
+define <vscale x 4 x i32> @ctlz_nxv4i32(<vscale x 4 x i32> %va) {
+; RV32I-LABEL: ctlz_nxv4i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v10, v10, a0
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v10, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv4i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv4i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV32D-NEXT: vsrl.vx v12, v12, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v10, v12, 0
+; RV32D-NEXT: addi a0, zero, 1054
+; RV32D-NEXT: vrsub.vx v10, v10, a0
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: addi a0, zero, 32
+; RV32D-NEXT: vmerge.vxm v8, v10, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv4i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV64D-NEXT: vsrl.vx v12, v12, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v10, v12, 0
+; RV64D-NEXT: addi a0, zero, 1054
+; RV64D-NEXT: vrsub.vx v10, v10, a0
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: addi a0, zero, 32
+; RV64D-NEXT: vmerge.vxm v8, v10, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 4 x i32> @llvm.ctlz.nxv4i32(<vscale x 4 x i32> %va, i1 false)
+ ret <vscale x 4 x i32> %a
+}
+declare <vscale x 4 x i32> @llvm.ctlz.nxv4i32(<vscale x 4 x i32>, i1)
+
+define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
+; RV32I-LABEL: ctlz_nxv8i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v12, v12, a0
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v12, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv8i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vsub.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v12, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v12, v8
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_nxv8i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV32D-NEXT: vsrl.vx v16, v16, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32D-NEXT: vnsrl.wi v12, v16, 0
+; RV32D-NEXT: addi a0, zero, 1054
+; RV32D-NEXT: vrsub.vx v12, v12, a0
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: addi a0, zero, 32
+; RV32D-NEXT: vmerge.vxm v8, v12, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_nxv8i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64D-NEXT: vsrl.vx v16, v16, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64D-NEXT: vnsrl.wi v12, v16, 0
+; RV64D-NEXT: addi a0, zero, 1054
+; RV64D-NEXT: vrsub.vx v12, v12, a0
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: addi a0, zero, 32
+; RV64D-NEXT: vmerge.vxm v8, v12, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 8 x i32> @llvm.ctlz.nxv8i32(<vscale x 8 x i32> %va, i1 false)
+ ret <vscale x 8 x i32> %a
+}
+declare <vscale x 8 x i32> @llvm.ctlz.nxv8i32(<vscale x 8 x i32>, i1)
+
+define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
+; RV32-LABEL: ctlz_nxv16i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v9
+; RV32-NEXT: vsetvli a0, zero, e32, m8, ta, mu
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 2
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 8
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 16
+; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 5
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: lui a0, 349525
; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 3
+; RV32-NEXT: vand.vx v16, v16, a0
+; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: lui a0, 209715
; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
+; RV32-NEXT: vand.vx v16, v8, a0
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 1
+; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
+; RV32-NEXT: lui a0, 4112
+; RV32-NEXT: addi a0, a0, 257
; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
+; RV32-NEXT: vsrl.vi v8, v8, 24
; RV32-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv2i16:
+; RV64-LABEL: ctlz_nxv16i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v9
+; RV64-NEXT: vsetvli a0, zero, e32, m8, ta, mu
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vor.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 2
+; RV64-NEXT: vor.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vor.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 8
+; RV64-NEXT: vor.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 16
+; RV64-NEXT: vor.vv v8, v8, v16
; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 5
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: lui a0, 349525
; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 3
+; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vsub.vv v8, v8, v16
+; RV64-NEXT: lui a0, 209715
; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
+; RV64-NEXT: vand.vx v16, v8, a0
; RV64-NEXT: vsrl.vi v8, v8, 2
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 1
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: lui a0, 61681
; RV64-NEXT: addiw a0, a0, -241
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
+; RV64-NEXT: lui a0, 4112
+; RV64-NEXT: addiw a0, a0, 257
; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
+; RV64-NEXT: vsrl.vi v8, v8, 24
; RV64-NEXT: ret
- %a = call <vscale x 2 x i16> @llvm.ctlz.nxv2i16(<vscale x 2 x i16> %va, i1 false)
- ret <vscale x 2 x i16> %a
+ %a = call <vscale x 16 x i32> @llvm.ctlz.nxv16i32(<vscale x 16 x i32> %va, i1 false)
+ ret <vscale x 16 x i32> %a
}
-declare <vscale x 2 x i16> @llvm.ctlz.nxv2i16(<vscale x 2 x i16>, i1)
+declare <vscale x 16 x i32> @llvm.ctlz.nxv16i32(<vscale x 16 x i32>, i1)
-define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
-; RV32-LABEL: ctlz_nxv4i16:
+define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
+; RV32-LABEL: ctlz_nxv1i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: lui a0, 349525
+; RV32-NEXT: addi a0, a0, 1365
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 209715
+; RV32-NEXT: addi a0, a0, 819
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 61681
+; RV32-NEXT: addi a0, a0, -241
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 4112
+; RV32-NEXT: addi a0, a0, 257
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
; RV32-NEXT: vsrl.vi v9, v8, 1
; RV32-NEXT: vor.vv v8, v8, v9
; RV32-NEXT: vsrl.vi v9, v8, 2
@@ -359,31 +1606,39 @@ define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
; RV32-NEXT: vor.vv v8, v8, v9
; RV32-NEXT: vsrl.vi v9, v8, 8
; RV32-NEXT: vor.vv v8, v8, v9
+; RV32-NEXT: vsrl.vi v9, v8, 16
+; RV32-NEXT: vor.vv v8, v8, v9
+; RV32-NEXT: addi a0, zero, 32
+; RV32-NEXT: vsrl.vx v9, v8, a0
+; RV32-NEXT: vor.vv v8, v8, v9
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v9, (a0), zero
; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsrl.vi v11, v8, 1
+; RV32-NEXT: vand.vv v9, v11, v9
; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
+; RV32-NEXT: vand.vv v9, v8, v10
; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
+; RV32-NEXT: vand.vv v8, v8, v10
; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsrl.vi v11, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v11
+; RV32-NEXT: vand.vv v8, v8, v9
+; RV32-NEXT: vmul.vv v8, v8, v10
+; RV32-NEXT: addi a0, zero, 56
+; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv4i16:
+; RV64-LABEL: ctlz_nxv1i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu
; RV64-NEXT: vsrl.vi v9, v8, 1
; RV64-NEXT: vor.vv v8, v8, v9
; RV64-NEXT: vsrl.vi v9, v8, 2
@@ -392,36 +1647,83 @@ define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
; RV64-NEXT: vor.vv v8, v8, v9
; RV64-NEXT: vsrl.vi v9, v8, 8
; RV64-NEXT: vor.vv v8, v8, v9
+; RV64-NEXT: vsrl.vi v9, v8, 16
+; RV64-NEXT: vor.vv v8, v8, v9
+; RV64-NEXT: addi a0, zero, 32
+; RV64-NEXT: vsrl.vx v9, v8, a0
+; RV64-NEXT: vor.vv v8, v8, v9
; RV64-NEXT: vxor.vi v8, v8, -1
; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 5
+; RV64-NEXT: lui a0, 21845
; RV64-NEXT: addiw a0, a0, 1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 1365
; RV64-NEXT: vand.vx v9, v9, a0
; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 3
+; RV64-NEXT: lui a0, 13107
; RV64-NEXT: addiw a0, a0, 819
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 819
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 819
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 819
; RV64-NEXT: vand.vx v9, v8, a0
; RV64-NEXT: vsrl.vi v8, v8, 2
; RV64-NEXT: vand.vx v8, v8, a0
; RV64-NEXT: vadd.vv v8, v9, v8
; RV64-NEXT: vsrl.vi v9, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
+; RV64-NEXT: lui a0, 3855
+; RV64-NEXT: addiw a0, a0, 241
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, -241
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 241
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, -241
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
+; RV64-NEXT: lui a0, 4112
+; RV64-NEXT: addiw a0, a0, 257
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: addi a0, a0, 257
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: addi a0, a0, 257
; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
+; RV64-NEXT: addi a0, zero, 56
+; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
- %a = call <vscale x 4 x i16> @llvm.ctlz.nxv4i16(<vscale x 4 x i16> %va, i1 false)
- ret <vscale x 4 x i16> %a
+ %a = call <vscale x 1 x i64> @llvm.ctlz.nxv1i64(<vscale x 1 x i64> %va, i1 false)
+ ret <vscale x 1 x i64> %a
}
-declare <vscale x 4 x i16> @llvm.ctlz.nxv4i16(<vscale x 4 x i16>, i1)
+declare <vscale x 1 x i64> @llvm.ctlz.nxv1i64(<vscale x 1 x i64>, i1)
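For the i64 element tests there is no wider FP type to convert into, so these checks keep the generic bit-manipulation expansion: the leading one is smeared across the value (including the extra vsrl.vx by 32 step), the result is inverted, and the leading zeros are counted with the usual parallel popcount masks, a multiply by 0x0101010101010101, and a final shift right by 56 (64 minus 8). On RV64 those 64-bit constants account for the long lui/addiw/slli/addi chains; on RV32 the same 32-bit word is stored twice on the stack and splatted with vlse64.v. A scalar C++ sketch of that expansion (again only illustrative, not the lowering code itself):

#include <cstdint>

unsigned ctlz64_expanded(uint64_t X) {
  // Smear the highest set bit into every lower position, then invert so the
  // set bits that remain are exactly the leading zeros of the original value.
  X |= X >> 1;  X |= X >> 2;  X |= X >> 4;
  X |= X >> 8;  X |= X >> 16; X |= X >> 32;
  X = ~X;
  // Parallel popcount of the inverted value.
  X -= (X >> 1) & 0x5555555555555555ULL;
  X = (X & 0x3333333333333333ULL) + ((X >> 2) & 0x3333333333333333ULL);
  X = (X + (X >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
  return (X * 0x0101010101010101ULL) >> 56;  // sum the byte counts into the top byte
}
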
-define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
-; RV32-LABEL: ctlz_nxv8i16:
+define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
+; RV32-LABEL: ctlz_nxv2i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: lui a0, 349525
+; RV32-NEXT: addi a0, a0, 1365
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 209715
+; RV32-NEXT: addi a0, a0, 819
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 61681
+; RV32-NEXT: addi a0, a0, -241
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 4112
+; RV32-NEXT: addi a0, a0, 257
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu
; RV32-NEXT: vsrl.vi v10, v8, 1
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vsrl.vi v10, v8, 2
@@ -430,31 +1732,39 @@ define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vsrl.vi v10, v8, 8
; RV32-NEXT: vor.vv v8, v8, v10
+; RV32-NEXT: vsrl.vi v10, v8, 16
+; RV32-NEXT: vor.vv v8, v8, v10
+; RV32-NEXT: addi a0, zero, 32
+; RV32-NEXT: vsrl.vx v10, v8, a0
+; RV32-NEXT: vor.vv v8, v8, v10
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v10, (a0), zero
; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v10, v10, a0
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsrl.vi v14, v8, 1
+; RV32-NEXT: vand.vv v10, v14, v10
; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v10, v8, a0
+; RV32-NEXT: vand.vv v10, v8, v12
; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
+; RV32-NEXT: vand.vv v8, v8, v12
; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsrl.vi v14, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v14
+; RV32-NEXT: vand.vv v8, v8, v10
+; RV32-NEXT: vmul.vv v8, v8, v12
+; RV32-NEXT: addi a0, zero, 56
+; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv8i16:
+; RV64-LABEL: ctlz_nxv2i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu
; RV64-NEXT: vsrl.vi v10, v8, 1
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vsrl.vi v10, v8, 2
@@ -463,36 +1773,83 @@ define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vsrl.vi v10, v8, 8
; RV64-NEXT: vor.vv v8, v8, v10
+; RV64-NEXT: vsrl.vi v10, v8, 16
+; RV64-NEXT: vor.vv v8, v8, v10
+; RV64-NEXT: addi a0, zero, 32
+; RV64-NEXT: vsrl.vx v10, v8, a0
+; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vxor.vi v8, v8, -1
; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 5
+; RV64-NEXT: lui a0, 21845
; RV64-NEXT: addiw a0, a0, 1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 1365
; RV64-NEXT: vand.vx v10, v10, a0
; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 3
+; RV64-NEXT: lui a0, 13107
; RV64-NEXT: addiw a0, a0, 819
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 819
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 819
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 819
; RV64-NEXT: vand.vx v10, v8, a0
; RV64-NEXT: vsrl.vi v8, v8, 2
; RV64-NEXT: vand.vx v8, v8, a0
; RV64-NEXT: vadd.vv v8, v10, v8
; RV64-NEXT: vsrl.vi v10, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
+; RV64-NEXT: lui a0, 3855
+; RV64-NEXT: addiw a0, a0, 241
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, -241
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 241
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, -241
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
+; RV64-NEXT: lui a0, 4112
+; RV64-NEXT: addiw a0, a0, 257
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: addi a0, a0, 257
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: addi a0, a0, 257
; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
+; RV64-NEXT: addi a0, zero, 56
+; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
- %a = call <vscale x 8 x i16> @llvm.ctlz.nxv8i16(<vscale x 8 x i16> %va, i1 false)
- ret <vscale x 8 x i16> %a
+ %a = call <vscale x 2 x i64> @llvm.ctlz.nxv2i64(<vscale x 2 x i64> %va, i1 false)
+ ret <vscale x 2 x i64> %a
}
-declare <vscale x 8 x i16> @llvm.ctlz.nxv8i16(<vscale x 8 x i16>, i1)
+declare <vscale x 2 x i64> @llvm.ctlz.nxv2i64(<vscale x 2 x i64>, i1)
-define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
-; RV32-LABEL: ctlz_nxv16i16:
+define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
+; RV32-LABEL: ctlz_nxv4i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: lui a0, 349525
+; RV32-NEXT: addi a0, a0, 1365
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 209715
+; RV32-NEXT: addi a0, a0, 819
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 61681
+; RV32-NEXT: addi a0, a0, -241
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 4112
+; RV32-NEXT: addi a0, a0, 257
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu
; RV32-NEXT: vsrl.vi v12, v8, 1
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: vsrl.vi v12, v8, 2
@@ -501,31 +1858,39 @@ define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: vsrl.vi v12, v8, 8
; RV32-NEXT: vor.vv v8, v8, v12
+; RV32-NEXT: vsrl.vi v12, v8, 16
+; RV32-NEXT: vor.vv v8, v8, v12
+; RV32-NEXT: addi a0, zero, 32
+; RV32-NEXT: vsrl.vx v12, v8, a0
+; RV32-NEXT: vor.vv v8, v8, v12
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v12, (a0), zero
; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v12, v12, a0
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsrl.vi v20, v8, 1
+; RV32-NEXT: vand.vv v12, v20, v12
; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v12, v8, a0
+; RV32-NEXT: vand.vv v12, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
+; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsrl.vi v20, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v20
+; RV32-NEXT: vand.vv v8, v8, v12
+; RV32-NEXT: vmul.vv v8, v8, v16
+; RV32-NEXT: addi a0, zero, 56
+; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv16i16:
+; RV64-LABEL: ctlz_nxv4i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu
; RV64-NEXT: vsrl.vi v12, v8, 1
; RV64-NEXT: vor.vv v8, v8, v12
; RV64-NEXT: vsrl.vi v12, v8, 2
@@ -534,36 +1899,83 @@ define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
; RV64-NEXT: vor.vv v8, v8, v12
; RV64-NEXT: vsrl.vi v12, v8, 8
; RV64-NEXT: vor.vv v8, v8, v12
+; RV64-NEXT: vsrl.vi v12, v8, 16
+; RV64-NEXT: vor.vv v8, v8, v12
+; RV64-NEXT: addi a0, zero, 32
+; RV64-NEXT: vsrl.vx v12, v8, a0
+; RV64-NEXT: vor.vv v8, v8, v12
; RV64-NEXT: vxor.vi v8, v8, -1
; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 5
+; RV64-NEXT: lui a0, 21845
; RV64-NEXT: addiw a0, a0, 1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 1365
; RV64-NEXT: vand.vx v12, v12, a0
; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 3
+; RV64-NEXT: lui a0, 13107
; RV64-NEXT: addiw a0, a0, 819
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 819
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 819
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 819
; RV64-NEXT: vand.vx v12, v8, a0
; RV64-NEXT: vsrl.vi v8, v8, 2
; RV64-NEXT: vand.vx v8, v8, a0
; RV64-NEXT: vadd.vv v8, v12, v8
; RV64-NEXT: vsrl.vi v12, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
+; RV64-NEXT: lui a0, 3855
+; RV64-NEXT: addiw a0, a0, 241
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, -241
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 241
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, -241
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
+; RV64-NEXT: lui a0, 4112
+; RV64-NEXT: addiw a0, a0, 257
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: addi a0, a0, 257
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: addi a0, a0, 257
; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
+; RV64-NEXT: addi a0, zero, 56
+; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
- %a = call <vscale x 16 x i16> @llvm.ctlz.nxv16i16(<vscale x 16 x i16> %va, i1 false)
- ret <vscale x 16 x i16> %a
+ %a = call <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64> %va, i1 false)
+ ret <vscale x 4 x i64> %a
}
-declare <vscale x 16 x i16> @llvm.ctlz.nxv16i16(<vscale x 16 x i16>, i1)
+declare <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64>, i1)
-define <vscale x 32 x i16> @ctlz_nxv32i16(<vscale x 32 x i16> %va) {
-; RV32-LABEL: ctlz_nxv32i16:
+define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
+; RV32-LABEL: ctlz_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: lui a0, 349525
+; RV32-NEXT: addi a0, a0, 1365
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 209715
+; RV32-NEXT: addi a0, a0, 819
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 61681
+; RV32-NEXT: addi a0, a0, -241
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 4112
+; RV32-NEXT: addi a0, a0, 257
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vsrl.vi v16, v8, 2
@@ -572,31 +1984,39 @@ define <vscale x 32 x i16> @ctlz_nxv32i16(<vscale x 32 x i16> %va) {
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vsrl.vi v16, v8, 8
; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 16
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: addi a0, zero, 32
+; RV32-NEXT: vsrl.vx v16, v8, a0
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v16, v16, a0
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v24, (a0), zero
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: vand.vv v16, v0, v16
; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v16, v8, a0
+; RV32-NEXT: vand.vv v16, v8, v24
; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
+; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v24, (a0), zero
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vmul.vv v8, v8, v24
+; RV32-NEXT: addi a0, zero, 56
+; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv32i16:
+; RV64-LABEL: ctlz_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu
+; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu
; RV64-NEXT: vsrl.vi v16, v8, 1
; RV64-NEXT: vor.vv v8, v8, v16
; RV64-NEXT: vsrl.vi v16, v8, 2
@@ -605,1837 +2025,1461 @@ define <vscale x 32 x i16> @ctlz_nxv32i16(<vscale x 32 x i16> %va) {
; RV64-NEXT: vor.vv v8, v8, v16
; RV64-NEXT: vsrl.vi v16, v8, 8
; RV64-NEXT: vor.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 16
+; RV64-NEXT: vor.vv v8, v8, v16
+; RV64-NEXT: addi a0, zero, 32
+; RV64-NEXT: vsrl.vx v16, v8, a0
+; RV64-NEXT: vor.vv v8, v8, v16
; RV64-NEXT: vxor.vi v8, v8, -1
; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 5
+; RV64-NEXT: lui a0, 21845
; RV64-NEXT: addiw a0, a0, 1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 1365
; RV64-NEXT: vand.vx v16, v16, a0
; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 3
+; RV64-NEXT: lui a0, 13107
; RV64-NEXT: addiw a0, a0, 819
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 819
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 819
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 819
; RV64-NEXT: vand.vx v16, v8, a0
; RV64-NEXT: vsrl.vi v8, v8, 2
; RV64-NEXT: vand.vx v8, v8, a0
; RV64-NEXT: vadd.vv v8, v16, v8
; RV64-NEXT: vsrl.vi v16, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
+; RV64-NEXT: lui a0, 3855
+; RV64-NEXT: addiw a0, a0, 241
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, -241
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, 241
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, -241
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
+; RV64-NEXT: lui a0, 4112
+; RV64-NEXT: addiw a0, a0, 257
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: addi a0, a0, 257
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: addi a0, a0, 257
; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
+; RV64-NEXT: addi a0, zero, 56
+; RV64-NEXT: vsrl.vx v8, v8, a0
; RV64-NEXT: ret
- %a = call <vscale x 32 x i16> @llvm.ctlz.nxv32i16(<vscale x 32 x i16> %va, i1 false)
- ret <vscale x 32 x i16> %a
+ %a = call <vscale x 8 x i64> @llvm.ctlz.nxv8i64(<vscale x 8 x i64> %va, i1 false)
+ ret <vscale x 8 x i64> %a
}
-declare <vscale x 32 x i16> @llvm.ctlz.nxv32i16(<vscale x 32 x i16>, i1)
+declare <vscale x 8 x i64> @llvm.ctlz.nxv8i64(<vscale x 8 x i64>, i1)
-define <vscale x 1 x i32> @ctlz_nxv1i32(<vscale x 1 x i32> %va) {
-; RV32-LABEL: ctlz_nxv1i32:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+define <vscale x 1 x i8> @ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va) {
+; RV32I-LABEL: ctlz_zero_undef_nxv1i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e8, mf8, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv1i32:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
- %a = call <vscale x 1 x i32> @llvm.ctlz.nxv1i32(<vscale x 1 x i32> %va, i1 false)
- ret <vscale x 1 x i32> %a
+; RV64I-LABEL: ctlz_zero_undef_nxv1i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e8, mf8, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_zero_undef_nxv1i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vzext.vf4 v9, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v9
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 134
+; RV32D-NEXT: vrsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_zero_undef_nxv1i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vzext.vf4 v9, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v9
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 134
+; RV64D-NEXT: vrsub.vx v8, v8, a0
+; RV64D-NEXT: ret
+ %a = call <vscale x 1 x i8> @llvm.ctlz.nxv1i8(<vscale x 1 x i8> %va, i1 true)
+ ret <vscale x 1 x i8> %a
}
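The new ctlz_zero_undef checks for i8 elements use the same exponent trick with f32: the i8 value is zero extended to i32 (vzext.vf4), converted with vfcvt.f.xu.v, and the biased exponent (the f32 bits shifted right by 23) is subtracted from 134 = 127 (f32 exponent bias) + 7 (element width minus one). No vmseq/vmerge is needed because the result for zero is undef here. In the same scalar C++ style (hypothetical name, input assumed nonzero):

#include <cstdint>
#include <cstring>

unsigned ctlz8_zero_undef_via_fp(uint8_t X) {
  float F = X;                  // exact: every uint8_t is representable in an f32
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return 134 - (Bits >> 23);    // 134 - (127 + floor(log2(X))) = 7 - floor(log2(X))
}
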
-declare <vscale x 1 x i32> @llvm.ctlz.nxv1i32(<vscale x 1 x i32>, i1)
-define <vscale x 2 x i32> @ctlz_nxv2i32(<vscale x 2 x i32> %va) {
-; RV32-LABEL: ctlz_nxv2i32:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, mu
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+define <vscale x 2 x i8> @ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va) {
+; RV32I-LABEL: ctlz_zero_undef_nxv2i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e8, mf4, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv2i32:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, mu
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
- %a = call <vscale x 2 x i32> @llvm.ctlz.nxv2i32(<vscale x 2 x i32> %va, i1 false)
- ret <vscale x 2 x i32> %a
+; RV64I-LABEL: ctlz_zero_undef_nxv2i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e8, mf4, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_zero_undef_nxv2i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV32D-NEXT: vzext.vf4 v9, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v9
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 134
+; RV32D-NEXT: vrsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_zero_undef_nxv2i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV64D-NEXT: vzext.vf4 v9, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v9
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 134
+; RV64D-NEXT: vrsub.vx v8, v8, a0
+; RV64D-NEXT: ret
+ %a = call <vscale x 2 x i8> @llvm.ctlz.nxv2i8(<vscale x 2 x i8> %va, i1 true)
+ ret <vscale x 2 x i8> %a
}
-declare <vscale x 2 x i32> @llvm.ctlz.nxv2i32(<vscale x 2 x i32>, i1)
-define <vscale x 4 x i32> @ctlz_nxv4i32(<vscale x 4 x i32> %va) {
-; RV32-LABEL: ctlz_nxv4i32:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, mu
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v10, v10, a0
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v10, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+define <vscale x 4 x i8> @ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
+; RV32I-LABEL: ctlz_zero_undef_nxv4i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e8, mf2, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv4i32:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, mu
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v10, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
- %a = call <vscale x 4 x i32> @llvm.ctlz.nxv4i32(<vscale x 4 x i32> %va, i1 false)
- ret <vscale x 4 x i32> %a
+; RV64I-LABEL: ctlz_zero_undef_nxv4i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e8, mf2, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_zero_undef_nxv4i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV32D-NEXT: vzext.vf4 v10, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v10
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v10, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v10, 0
+; RV32D-NEXT: addi a0, zero, 134
+; RV32D-NEXT: vrsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_zero_undef_nxv4i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV64D-NEXT: vzext.vf4 v10, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v10
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v10, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v10, 0
+; RV64D-NEXT: addi a0, zero, 134
+; RV64D-NEXT: vrsub.vx v8, v8, a0
+; RV64D-NEXT: ret
+ %a = call <vscale x 4 x i8> @llvm.ctlz.nxv4i8(<vscale x 4 x i8> %va, i1 true)
+ ret <vscale x 4 x i8> %a
}
-declare <vscale x 4 x i32> @llvm.ctlz.nxv4i32(<vscale x 4 x i32>, i1)
-define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
-; RV32-LABEL: ctlz_nxv8i32:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m4, ta, mu
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v12, v12, a0
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v12, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+define <vscale x 8 x i8> @ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
+; RV32I-LABEL: ctlz_zero_undef_nxv8i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e8, m1, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv8i32:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, m4, ta, mu
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v12, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
- %a = call <vscale x 8 x i32> @llvm.ctlz.nxv8i32(<vscale x 8 x i32> %va, i1 false)
- ret <vscale x 8 x i32> %a
-}
-declare <vscale x 8 x i32> @llvm.ctlz.nxv8i32(<vscale x 8 x i32>, i1)
-
-define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
-; RV32-LABEL: ctlz_nxv16i32:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m8, ta, mu
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v16, v16, a0
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v16, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
-;
-; RV64-LABEL: ctlz_nxv16i32:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, m8, ta, mu
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
- %a = call <vscale x 16 x i32> @llvm.ctlz.nxv16i32(<vscale x 16 x i32> %va, i1 false)
- ret <vscale x 16 x i32> %a
-}
-declare <vscale x 16 x i32> @llvm.ctlz.nxv16i32(<vscale x 16 x i32>, i1)
-
-define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
-; RV32-LABEL: ctlz_nxv1i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: addi a0, zero, 32
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v9, (a0), zero
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vsrl.vi v11, v8, 1
-; RV32-NEXT: vand.vv v9, v11, v9
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: vand.vv v9, v8, v10
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v9, (a0), zero
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vsrl.vi v11, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v11
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vmul.vv v8, v8, v10
-; RV32-NEXT: addi a0, zero, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
-;
-; RV64-LABEL: ctlz_nxv1i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: addi a0, zero, 32
-; RV64-NEXT: vsrl.vx v9, v8, a0
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 21845
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 1365
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 1365
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 13107
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 819
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 819
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 3855
-; RV64-NEXT: addiw a0, a0, 241
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, -241
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 241
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a0, a0, 16
-; RV64-NEXT: addi a0, a0, 257
-; RV64-NEXT: slli a0, a0, 16
-; RV64-NEXT: addi a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
- %a = call <vscale x 1 x i64> @llvm.ctlz.nxv1i64(<vscale x 1 x i64> %va, i1 false)
- ret <vscale x 1 x i64> %a
-}
-declare <vscale x 1 x i64> @llvm.ctlz.nxv1i64(<vscale x 1 x i64>, i1)
-
-define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
-; RV32-LABEL: ctlz_nxv2i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: addi a0, zero, 32
-; RV32-NEXT: vsrl.vx v10, v8, a0
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vsrl.vi v14, v8, 1
-; RV32-NEXT: vand.vv v10, v14, v10
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: vand.vv v10, v8, v12
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vsrl.vi v14, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v14
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vmul.vv v8, v8, v12
-; RV32-NEXT: addi a0, zero, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
-;
-; RV64-LABEL: ctlz_nxv2i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: addi a0, zero, 32
-; RV64-NEXT: vsrl.vx v10, v8, a0
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 21845
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 1365
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 1365
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 1365
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 13107
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 819
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 819
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 819
-; RV64-NEXT: vand.vx v10, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 3855
-; RV64-NEXT: addiw a0, a0, 241
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, -241
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 241
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a0, a0, 16
-; RV64-NEXT: addi a0, a0, 257
-; RV64-NEXT: slli a0, a0, 16
-; RV64-NEXT: addi a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
- %a = call <vscale x 2 x i64> @llvm.ctlz.nxv2i64(<vscale x 2 x i64> %va, i1 false)
- ret <vscale x 2 x i64> %a
-}
-declare <vscale x 2 x i64> @llvm.ctlz.nxv2i64(<vscale x 2 x i64>, i1)
-
-define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
-; RV32-LABEL: ctlz_nxv4i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: addi a0, zero, 32
-; RV32-NEXT: vsrl.vx v12, v8, a0
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vsrl.vi v20, v8, 1
-; RV32-NEXT: vand.vv v12, v20, v12
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: vand.vv v12, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vsrl.vi v20, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v20
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vmul.vv v8, v8, v16
-; RV32-NEXT: addi a0, zero, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
-;
-; RV64-LABEL: ctlz_nxv4i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: addi a0, zero, 32
-; RV64-NEXT: vsrl.vx v12, v8, a0
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 21845
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 1365
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 1365
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 1365
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 13107
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 819
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 819
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 819
-; RV64-NEXT: vand.vx v12, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 3855
-; RV64-NEXT: addiw a0, a0, 241
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, -241
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 241
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a0, a0, 16
-; RV64-NEXT: addi a0, a0, 257
-; RV64-NEXT: slli a0, a0, 16
-; RV64-NEXT: addi a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
- %a = call <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64> %va, i1 false)
- ret <vscale x 4 x i64> %a
-}
-declare <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64>, i1)
-
-define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
-; RV32-LABEL: ctlz_nxv8i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: addi a0, zero, 32
-; RV32-NEXT: vsrl.vx v16, v8, a0
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v24, (a0), zero
-; RV32-NEXT: vsrl.vi v0, v8, 1
-; RV32-NEXT: vand.vv v16, v0, v16
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: vand.vv v16, v8, v24
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v24, (a0), zero
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v0
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vmul.vv v8, v8, v24
-; RV32-NEXT: addi a0, zero, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
-;
-; RV64-LABEL: ctlz_nxv8i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: addi a0, zero, 32
-; RV64-NEXT: vsrl.vx v16, v8, a0
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 21845
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 1365
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 1365
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 1365
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 13107
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 819
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 819
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 819
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 3855
-; RV64-NEXT: addiw a0, a0, 241
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, -241
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, 241
-; RV64-NEXT: slli a0, a0, 12
-; RV64-NEXT: addi a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: slli a0, a0, 16
-; RV64-NEXT: addi a0, a0, 257
-; RV64-NEXT: slli a0, a0, 16
-; RV64-NEXT: addi a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
- %a = call <vscale x 8 x i64> @llvm.ctlz.nxv8i64(<vscale x 8 x i64> %va, i1 false)
- ret <vscale x 8 x i64> %a
-}
-declare <vscale x 8 x i64> @llvm.ctlz.nxv8i64(<vscale x 8 x i64>, i1)
-
-define <vscale x 1 x i8> @ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va) {
-; CHECK-LABEL: ctlz_zero_undef_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 2
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
- %a = call <vscale x 1 x i8> @llvm.ctlz.nxv1i8(<vscale x 1 x i8> %va, i1 true)
- ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 2 x i8> @ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va) {
-; CHECK-LABEL: ctlz_zero_undef_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 2
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
- %a = call <vscale x 2 x i8> @llvm.ctlz.nxv2i8(<vscale x 2 x i8> %va, i1 true)
- ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 4 x i8> @ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
-; CHECK-LABEL: ctlz_zero_undef_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 2
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
- %a = call <vscale x 4 x i8> @llvm.ctlz.nxv4i8(<vscale x 4 x i8> %va, i1 true)
- ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 8 x i8> @ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
-; CHECK-LABEL: ctlz_zero_undef_nxv8i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, mu
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 2
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
- %a = call <vscale x 8 x i8> @llvm.ctlz.nxv8i8(<vscale x 8 x i8> %va, i1 true)
- ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 16 x i8> @ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
-; CHECK-LABEL: ctlz_zero_undef_nxv16i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, mu
-; CHECK-NEXT: vsrl.vi v10, v8, 1
-; CHECK-NEXT: vor.vv v8, v8, v10
-; CHECK-NEXT: vsrl.vi v10, v8, 2
-; CHECK-NEXT: vor.vv v8, v8, v10
-; CHECK-NEXT: vsrl.vi v10, v8, 4
-; CHECK-NEXT: vor.vv v8, v8, v10
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vsrl.vi v10, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v10, v10, a0
-; CHECK-NEXT: vsub.vv v8, v8, v10
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v10, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v10, v8
-; CHECK-NEXT: vsrl.vi v10, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v10
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
- %a = call <vscale x 16 x i8> @llvm.ctlz.nxv16i8(<vscale x 16 x i8> %va, i1 true)
- ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 32 x i8> @ctlz_zero_undef_nxv32i8(<vscale x 32 x i8> %va) {
-; CHECK-LABEL: ctlz_zero_undef_nxv32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, mu
-; CHECK-NEXT: vsrl.vi v12, v8, 1
-; CHECK-NEXT: vor.vv v8, v8, v12
-; CHECK-NEXT: vsrl.vi v12, v8, 2
-; CHECK-NEXT: vor.vv v8, v8, v12
-; CHECK-NEXT: vsrl.vi v12, v8, 4
-; CHECK-NEXT: vor.vv v8, v8, v12
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vsrl.vi v12, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v12, v12, a0
-; CHECK-NEXT: vsub.vv v8, v8, v12
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v12, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v12, v8
-; CHECK-NEXT: vsrl.vi v12, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v12
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
- %a = call <vscale x 32 x i8> @llvm.ctlz.nxv32i8(<vscale x 32 x i8> %va, i1 true)
- ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 64 x i8> @ctlz_zero_undef_nxv64i8(<vscale x 64 x i8> %va) {
-; CHECK-LABEL: ctlz_zero_undef_nxv64i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, mu
-; CHECK-NEXT: vsrl.vi v16, v8, 1
-; CHECK-NEXT: vor.vv v8, v8, v16
-; CHECK-NEXT: vsrl.vi v16, v8, 2
-; CHECK-NEXT: vor.vv v8, v8, v16
-; CHECK-NEXT: vsrl.vi v16, v8, 4
-; CHECK-NEXT: vor.vv v8, v8, v16
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vsrl.vi v16, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v16, v16, a0
-; CHECK-NEXT: vsub.vv v8, v8, v16
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v16, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v16, v8
-; CHECK-NEXT: vsrl.vi v16, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v16
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
- %a = call <vscale x 64 x i8> @llvm.ctlz.nxv64i8(<vscale x 64 x i8> %va, i1 true)
- ret <vscale x 64 x i8> %a
+; RV64I-LABEL: ctlz_zero_undef_nxv8i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e8, m1, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_zero_undef_nxv8i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV32D-NEXT: vzext.vf4 v12, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v12
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v12, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v12, 0
+; RV32D-NEXT: addi a0, zero, 134
+; RV32D-NEXT: vrsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_zero_undef_nxv8i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV64D-NEXT: vzext.vf4 v12, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v12
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v12, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v12, 0
+; RV64D-NEXT: addi a0, zero, 134
+; RV64D-NEXT: vrsub.vx v8, v8, a0
+; RV64D-NEXT: ret
+ %a = call <vscale x 8 x i8> @llvm.ctlz.nxv8i8(<vscale x 8 x i8> %va, i1 true)
+ ret <vscale x 8 x i8> %a
}
-define <vscale x 1 x i16> @ctlz_zero_undef_nxv1i16(<vscale x 1 x i16> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv1i16:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
+define <vscale x 16 x i8> @ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
+; RV32I-LABEL: ctlz_zero_undef_nxv16i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e8, m2, ta, mu
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v10, v10, a0
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v10, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_zero_undef_nxv1i16:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
- %a = call <vscale x 1 x i16> @llvm.ctlz.nxv1i16(<vscale x 1 x i16> %va, i1 true)
- ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 2 x i16> @ctlz_zero_undef_nxv2i16(<vscale x 2 x i16> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv2i16:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
+; RV64I-LABEL: ctlz_zero_undef_nxv16i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e8, m2, ta, mu
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
;
-; RV64-LABEL: ctlz_zero_undef_nxv2i16:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
- %a = call <vscale x 2 x i16> @llvm.ctlz.nxv2i16(<vscale x 2 x i16> %va, i1 true)
- ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 4 x i16> @ctlz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv4i16:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
+; RV32D-LABEL: ctlz_zero_undef_nxv16i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m8, ta, mu
+; RV32D-NEXT: vzext.vf4 v16, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v16
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV32D-NEXT: vnsrl.wi v16, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v16, 0
+; RV32D-NEXT: addi a0, zero, 134
+; RV32D-NEXT: vrsub.vx v8, v8, a0
+; RV32D-NEXT: ret
;
-; RV64-LABEL: ctlz_zero_undef_nxv4i16:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
- %a = call <vscale x 4 x i16> @llvm.ctlz.nxv4i16(<vscale x 4 x i16> %va, i1 true)
- ret <vscale x 4 x i16> %a
+; RV64D-LABEL: ctlz_zero_undef_nxv16i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m8, ta, mu
+; RV64D-NEXT: vzext.vf4 v16, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v16
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV64D-NEXT: vnsrl.wi v16, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v16, 0
+; RV64D-NEXT: addi a0, zero, 134
+; RV64D-NEXT: vrsub.vx v8, v8, a0
+; RV64D-NEXT: ret
+ %a = call <vscale x 16 x i8> @llvm.ctlz.nxv16i8(<vscale x 16 x i8> %va, i1 true)
+ ret <vscale x 16 x i8> %a
}
-define <vscale x 8 x i16> @ctlz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv8i16:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, m2, ta, mu
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v10, v10, a0
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v10, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
-;
-; RV64-LABEL: ctlz_zero_undef_nxv8i16:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, m2, ta, mu
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v10, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
- %a = call <vscale x 8 x i16> @llvm.ctlz.nxv8i16(<vscale x 8 x i16> %va, i1 true)
- ret <vscale x 8 x i16> %a
+define <vscale x 32 x i8> @ctlz_zero_undef_nxv32i8(<vscale x 32 x i8> %va) {
+; CHECK-LABEL: ctlz_zero_undef_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, mu
+; CHECK-NEXT: vsrl.vi v12, v8, 1
+; CHECK-NEXT: vor.vv v8, v8, v12
+; CHECK-NEXT: vsrl.vi v12, v8, 2
+; CHECK-NEXT: vor.vv v8, v8, v12
+; CHECK-NEXT: vsrl.vi v12, v8, 4
+; CHECK-NEXT: vor.vv v8, v8, v12
+; CHECK-NEXT: vxor.vi v8, v8, -1
+; CHECK-NEXT: vsrl.vi v12, v8, 1
+; CHECK-NEXT: addi a0, zero, 85
+; CHECK-NEXT: vand.vx v12, v12, a0
+; CHECK-NEXT: vsub.vv v8, v8, v12
+; CHECK-NEXT: addi a0, zero, 51
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: vsrl.vi v8, v8, 2
+; CHECK-NEXT: vand.vx v8, v8, a0
+; CHECK-NEXT: vadd.vv v8, v12, v8
+; CHECK-NEXT: vsrl.vi v12, v8, 4
+; CHECK-NEXT: vadd.vv v8, v8, v12
+; CHECK-NEXT: vand.vi v8, v8, 15
+; CHECK-NEXT: ret
+ %a = call <vscale x 32 x i8> @llvm.ctlz.nxv32i8(<vscale x 32 x i8> %va, i1 true)
+ ret <vscale x 32 x i8> %a
}
-define <vscale x 16 x i16> @ctlz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv16i16:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, m4, ta, mu
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v12, v12, a0
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v12, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
-;
-; RV64-LABEL: ctlz_zero_undef_nxv16i16:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, m4, ta, mu
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v12, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
- %a = call <vscale x 16 x i16> @llvm.ctlz.nxv16i16(<vscale x 16 x i16> %va, i1 true)
- ret <vscale x 16 x i16> %a
+define <vscale x 64 x i8> @ctlz_zero_undef_nxv64i8(<vscale x 64 x i8> %va) {
+; CHECK-LABEL: ctlz_zero_undef_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, mu
+; CHECK-NEXT: vsrl.vi v16, v8, 1
+; CHECK-NEXT: vor.vv v8, v8, v16
+; CHECK-NEXT: vsrl.vi v16, v8, 2
+; CHECK-NEXT: vor.vv v8, v8, v16
+; CHECK-NEXT: vsrl.vi v16, v8, 4
+; CHECK-NEXT: vor.vv v8, v8, v16
+; CHECK-NEXT: vxor.vi v8, v8, -1
+; CHECK-NEXT: vsrl.vi v16, v8, 1
+; CHECK-NEXT: addi a0, zero, 85
+; CHECK-NEXT: vand.vx v16, v16, a0
+; CHECK-NEXT: vsub.vv v8, v8, v16
+; CHECK-NEXT: addi a0, zero, 51
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: vsrl.vi v8, v8, 2
+; CHECK-NEXT: vand.vx v8, v8, a0
+; CHECK-NEXT: vadd.vv v8, v16, v8
+; CHECK-NEXT: vsrl.vi v16, v8, 4
+; CHECK-NEXT: vadd.vv v8, v8, v16
+; CHECK-NEXT: vand.vi v8, v8, 15
+; CHECK-NEXT: ret
+ %a = call <vscale x 64 x i8> @llvm.ctlz.nxv64i8(<vscale x 64 x i8> %va, i1 true)
+ ret <vscale x 64 x i8> %a
}
-define <vscale x 32 x i16> @ctlz_zero_undef_nxv32i16(<vscale x 32 x i16> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv32i16:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v16, v16, a0
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v16, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
+define <vscale x 1 x i16> @ctlz_zero_undef_nxv1i16(<vscale x 1 x i16> %va) {
+; RV32I-LABEL: ctlz_zero_undef_nxv1i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_zero_undef_nxv32i16:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
- %a = call <vscale x 32 x i16> @llvm.ctlz.nxv32i16(<vscale x 32 x i16> %va, i1 true)
- ret <vscale x 32 x i16> %a
+; RV64I-LABEL: ctlz_zero_undef_nxv1i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_zero_undef_nxv1i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v9, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 142
+; RV32D-NEXT: vrsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_zero_undef_nxv1i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v9, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 142
+; RV64D-NEXT: vrsub.vx v8, v8, a0
+; RV64D-NEXT: ret
+ %a = call <vscale x 1 x i16> @llvm.ctlz.nxv1i16(<vscale x 1 x i16> %va, i1 true)
+ ret <vscale x 1 x i16> %a
}
-define <vscale x 1 x i32> @ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv1i32:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+define <vscale x 2 x i16> @ctlz_zero_undef_nxv2i16(<vscale x 2 x i16> %va) {
+; RV32I-LABEL: ctlz_zero_undef_nxv2i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_zero_undef_nxv1i32:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
- %a = call <vscale x 1 x i32> @llvm.ctlz.nxv1i32(<vscale x 1 x i32> %va, i1 true)
- ret <vscale x 1 x i32> %a
+; RV64I-LABEL: ctlz_zero_undef_nxv2i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_zero_undef_nxv2i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v9, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 142
+; RV32D-NEXT: vrsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_zero_undef_nxv2i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v9, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 142
+; RV64D-NEXT: vrsub.vx v8, v8, a0
+; RV64D-NEXT: ret
+ %a = call <vscale x 2 x i16> @llvm.ctlz.nxv2i16(<vscale x 2 x i16> %va, i1 true)
+ ret <vscale x 2 x i16> %a
+}
+
+define <vscale x 4 x i16> @ctlz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
+; RV32I-LABEL: ctlz_zero_undef_nxv4i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_zero_undef_nxv4i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_zero_undef_nxv4i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v10, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v10, v8, 0
+; RV32D-NEXT: addi a0, zero, 142
+; RV32D-NEXT: vrsub.vx v8, v10, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_zero_undef_nxv4i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v10, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v10, v8, 0
+; RV64D-NEXT: addi a0, zero, 142
+; RV64D-NEXT: vrsub.vx v8, v10, a0
+; RV64D-NEXT: ret
+ %a = call <vscale x 4 x i16> @llvm.ctlz.nxv4i16(<vscale x 4 x i16> %va, i1 true)
+ ret <vscale x 4 x i16> %a
}
-define <vscale x 2 x i32> @ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv2i32:
+define <vscale x 8 x i16> @ctlz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
+; RV32I-LABEL: ctlz_zero_undef_nxv8i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v10, v10, a0
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v10, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_zero_undef_nxv8i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_zero_undef_nxv8i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v12, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v12, v8, 0
+; RV32D-NEXT: addi a0, zero, 142
+; RV32D-NEXT: vrsub.vx v8, v12, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_zero_undef_nxv8i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v12, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v12, v8, 0
+; RV64D-NEXT: addi a0, zero, 142
+; RV64D-NEXT: vrsub.vx v8, v12, a0
+; RV64D-NEXT: ret
+ %a = call <vscale x 8 x i16> @llvm.ctlz.nxv8i16(<vscale x 8 x i16> %va, i1 true)
+ ret <vscale x 8 x i16> %a
+}
+
+define <vscale x 16 x i16> @ctlz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
+; RV32I-LABEL: ctlz_zero_undef_nxv16i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v12, v12, a0
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v12, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_zero_undef_nxv16i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vsub.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v12, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v12, v8
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_zero_undef_nxv16i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v16, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV32D-NEXT: vnsrl.wi v16, v8, 0
+; RV32D-NEXT: addi a0, zero, 142
+; RV32D-NEXT: vrsub.vx v8, v16, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_zero_undef_nxv16i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v16, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV64D-NEXT: vnsrl.wi v16, v8, 0
+; RV64D-NEXT: addi a0, zero, 142
+; RV64D-NEXT: vrsub.vx v8, v16, a0
+; RV64D-NEXT: ret
+ %a = call <vscale x 16 x i16> @llvm.ctlz.nxv16i16(<vscale x 16 x i16> %va, i1 true)
+ ret <vscale x 16 x i16> %a
+}
+
+define <vscale x 32 x i16> @ctlz_zero_undef_nxv32i16(<vscale x 32 x i16> %va) {
+; RV32-LABEL: ctlz_zero_undef_nxv32i16:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, mu
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v9
+; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 2
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 8
+; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 349525
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: lui a0, 5
; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 209715
+; RV32-NEXT: vand.vx v16, v16, a0
+; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: lui a0, 3
; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
+; RV32-NEXT: vand.vx v16, v8, a0
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 61681
+; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: lui a0, 1
; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
+; RV32-NEXT: addi a0, zero, 257
; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
+; RV32-NEXT: vsrl.vi v8, v8, 8
; RV32-NEXT: ret
;
-; RV64-LABEL: ctlz_zero_undef_nxv2i32:
+; RV64-LABEL: ctlz_zero_undef_nxv32i16:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, mu
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v9
+; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: vor.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 2
+; RV64-NEXT: vor.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vor.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 8
+; RV64-NEXT: vor.vv v8, v8, v16
; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: lui a0, 5
; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
+; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vsub.vv v8, v8, v16
+; RV64-NEXT: lui a0, 3
; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
+; RV64-NEXT: vand.vx v16, v8, a0
; RV64-NEXT: vsrl.vi v8, v8, 2
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: lui a0, 1
; RV64-NEXT: addiw a0, a0, -241
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
+; RV64-NEXT: addi a0, zero, 257
; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
+; RV64-NEXT: vsrl.vi v8, v8, 8
; RV64-NEXT: ret
+ %a = call <vscale x 32 x i16> @llvm.ctlz.nxv32i16(<vscale x 32 x i16> %va, i1 true)
+ ret <vscale x 32 x i16> %a
+}
+
+define <vscale x 1 x i32> @ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
+; RV32I-LABEL: ctlz_zero_undef_nxv1i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_zero_undef_nxv1i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_zero_undef_nxv1i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32D-NEXT: vsrl.vx v8, v9, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 1054
+; RV32D-NEXT: vrsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_zero_undef_nxv1i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64D-NEXT: vsrl.vx v8, v9, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 1054
+; RV64D-NEXT: vrsub.vx v8, v8, a0
+; RV64D-NEXT: ret
+ %a = call <vscale x 1 x i32> @llvm.ctlz.nxv1i32(<vscale x 1 x i32> %va, i1 true)
+ ret <vscale x 1 x i32> %a
+}
+
+define <vscale x 2 x i32> @ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
+; RV32I-LABEL: ctlz_zero_undef_nxv2i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_zero_undef_nxv2i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_zero_undef_nxv2i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; RV32D-NEXT: vsrl.vx v8, v10, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v10, v8, 0
+; RV32D-NEXT: addi a0, zero, 1054
+; RV32D-NEXT: vrsub.vx v8, v10, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_zero_undef_nxv2i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; RV64D-NEXT: vsrl.vx v8, v10, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v10, v8, 0
+; RV64D-NEXT: addi a0, zero, 1054
+; RV64D-NEXT: vrsub.vx v8, v10, a0
+; RV64D-NEXT: ret
%a = call <vscale x 2 x i32> @llvm.ctlz.nxv2i32(<vscale x 2 x i32> %va, i1 true)
ret <vscale x 2 x i32> %a
}
define <vscale x 4 x i32> @ctlz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv4i32:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, mu
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v10, v10, a0
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v10, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+; RV32I-LABEL: ctlz_zero_undef_nxv4i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v10, v10, a0
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v10, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_zero_undef_nxv4i32:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, mu
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v10, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
+; RV64I-LABEL: ctlz_zero_undef_nxv4i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_zero_undef_nxv4i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV32D-NEXT: vsrl.vx v8, v12, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v12, v8, 0
+; RV32D-NEXT: addi a0, zero, 1054
+; RV32D-NEXT: vrsub.vx v8, v12, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_zero_undef_nxv4i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV64D-NEXT: vsrl.vx v8, v12, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v12, v8, 0
+; RV64D-NEXT: addi a0, zero, 1054
+; RV64D-NEXT: vrsub.vx v8, v12, a0
+; RV64D-NEXT: ret
%a = call <vscale x 4 x i32> @llvm.ctlz.nxv4i32(<vscale x 4 x i32> %va, i1 true)
ret <vscale x 4 x i32> %a
}
define <vscale x 8 x i32> @ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv8i32:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m4, ta, mu
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v12, v12, a0
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v12, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+; RV32I-LABEL: ctlz_zero_undef_nxv8i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v12, v12, a0
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v12, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_zero_undef_nxv8i32:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, m4, ta, mu
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v12, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
+; RV64I-LABEL: ctlz_zero_undef_nxv8i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vsub.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v12, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v12, v8
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: ctlz_zero_undef_nxv8i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV32D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV32D-NEXT: vsrl.vx v8, v16, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32D-NEXT: vnsrl.wi v16, v8, 0
+; RV32D-NEXT: addi a0, zero, 1054
+; RV32D-NEXT: vrsub.vx v8, v16, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: ctlz_zero_undef_nxv8i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV64D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64D-NEXT: vsrl.vx v8, v16, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64D-NEXT: vnsrl.wi v16, v8, 0
+; RV64D-NEXT: addi a0, zero, 1054
+; RV64D-NEXT: vrsub.vx v8, v16, a0
+; RV64D-NEXT: ret
%a = call <vscale x 8 x i32> @llvm.ctlz.nxv8i32(<vscale x 8 x i32> %va, i1 true)
ret <vscale x 8 x i32> %a
}
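
The RV32D/RV64D check lines in the ctlz_zero_undef tests above all follow one shape: vfwcvt.f.xu.v converts the unsigned elements into a float type wide enough to hold them exactly, a right shift peels off the biased exponent (23 bits for f32, 52 for f64), vnsrl narrows it back to the element width, and vrsub rebases it against a constant equal to the float bias plus the element's top bit index (142 = 127 + 15 for the i16 tests, 1054 = 1023 + 31 for the i32 tests). A minimal scalar sketch of that arithmetic only, with a made-up helper name, not code from this patch:

// Sketch of the arithmetic the checks above exercise, assuming IEEE-754
// f64 (bias 1023, 52 mantissa bits); valid only for x != 0, which is the
// precondition the zero_undef forms provide.
#include <cstdint>
#include <cstring>

unsigned ctlz32_zero_undef_sketch(uint32_t x) {
  double d = static_cast<double>(x);                 // exact: f64 represents any u32
  uint64_t bits;
  std::memcpy(&bits, &d, sizeof bits);               // reinterpret the f64 bits
  unsigned exp = static_cast<unsigned>(bits >> 52);  // biased exponent (sign bit is 0)
  return 1054 - exp;                                 // (1023 + 31) - exp = 31 - floor(log2(x))
}

Because the zero_undef forms need no defined result at zero, these sequences carry no compare or merge, unlike the cttz tests in the next file.
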
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
index 9d78e1d28131f..796863e98e506 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
@@ -1,132 +1,439 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32I
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32D
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64D
define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
-; CHECK-LABEL: cttz_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, zero, 1
-; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, mu
-; CHECK-NEXT: vsub.vx v9, v8, a0
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vand.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: cttz_nxv1i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e8, mf8, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv1i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e8, mf8, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv1i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e8, mf8, ta, mu
+; RV32D-NEXT: vmv.v.i v9, 0
+; RV32D-NEXT: vmseq.vv v0, v9, v8
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vzext.vf4 v9, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v9
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv1i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e8, mf8, ta, mu
+; RV64D-NEXT: vmv.v.i v9, 0
+; RV64D-NEXT: vmseq.vv v0, v9, v8
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vzext.vf4 v9, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v9
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV64D-NEXT: ret
%a = call <vscale x 1 x i8> @llvm.cttz.nxv1i8(<vscale x 1 x i8> %va, i1 false)
ret <vscale x 1 x i8> %a
}
declare <vscale x 1 x i8> @llvm.cttz.nxv1i8(<vscale x 1 x i8>, i1)
define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
-; CHECK-LABEL: cttz_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, zero, 1
-; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, mu
-; CHECK-NEXT: vsub.vx v9, v8, a0
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vand.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: cttz_nxv2i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e8, mf4, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv2i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e8, mf4, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv2i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e8, mf4, ta, mu
+; RV32D-NEXT: vmv.v.i v9, 0
+; RV32D-NEXT: vmseq.vv v0, v9, v8
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV32D-NEXT: vzext.vf4 v9, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v9
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv2i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e8, mf4, ta, mu
+; RV64D-NEXT: vmv.v.i v9, 0
+; RV64D-NEXT: vmseq.vv v0, v9, v8
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV64D-NEXT: vzext.vf4 v9, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v9
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV64D-NEXT: ret
%a = call <vscale x 2 x i8> @llvm.cttz.nxv2i8(<vscale x 2 x i8> %va, i1 false)
ret <vscale x 2 x i8> %a
}
declare <vscale x 2 x i8> @llvm.cttz.nxv2i8(<vscale x 2 x i8>, i1)
define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
-; CHECK-LABEL: cttz_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, zero, 1
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu
-; CHECK-NEXT: vsub.vx v9, v8, a0
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vand.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: cttz_nxv4i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e8, mf2, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv4i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e8, mf2, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv4i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e8, mf2, ta, mu
+; RV32D-NEXT: vmv.v.i v9, 0
+; RV32D-NEXT: vmseq.vv v0, v9, v8
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32D-NEXT: vzext.vf4 v10, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v10
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v10, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v10, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv4i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e8, mf2, ta, mu
+; RV64D-NEXT: vmv.v.i v9, 0
+; RV64D-NEXT: vmseq.vv v0, v9, v8
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64D-NEXT: vzext.vf4 v10, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v10
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v10, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v10, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV64D-NEXT: ret
%a = call <vscale x 4 x i8> @llvm.cttz.nxv4i8(<vscale x 4 x i8> %va, i1 false)
ret <vscale x 4 x i8> %a
}
declare <vscale x 4 x i8> @llvm.cttz.nxv4i8(<vscale x 4 x i8>, i1)
define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
-; CHECK-LABEL: cttz_nxv8i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, zero, 1
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu
-; CHECK-NEXT: vsub.vx v9, v8, a0
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vand.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: cttz_nxv8i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e8, m1, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv8i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e8, m1, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv8i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e8, m1, ta, mu
+; RV32D-NEXT: vmv.v.i v9, 0
+; RV32D-NEXT: vmseq.vv v0, v9, v8
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32D-NEXT: vzext.vf4 v12, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v12
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v12, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v12, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv8i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e8, m1, ta, mu
+; RV64D-NEXT: vmv.v.i v9, 0
+; RV64D-NEXT: vmseq.vv v0, v9, v8
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64D-NEXT: vzext.vf4 v12, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v12
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v12, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v12, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV64D-NEXT: ret
%a = call <vscale x 8 x i8> @llvm.cttz.nxv8i8(<vscale x 8 x i8> %va, i1 false)
ret <vscale x 8 x i8> %a
}
declare <vscale x 8 x i8> @llvm.cttz.nxv8i8(<vscale x 8 x i8>, i1)
define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
-; CHECK-LABEL: cttz_nxv16i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, zero, 1
-; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, mu
-; CHECK-NEXT: vsub.vx v10, v8, a0
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vand.vv v8, v8, v10
-; CHECK-NEXT: vsrl.vi v10, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v10, v10, a0
-; CHECK-NEXT: vsub.vv v8, v8, v10
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v10, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v10, v8
-; CHECK-NEXT: vsrl.vi v10, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v10
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: cttz_nxv16i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e8, m2, ta, mu
+; RV32I-NEXT: vsub.vx v10, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v10, v10, a0
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v10, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv16i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e8, m2, ta, mu
+; RV64I-NEXT: vsub.vx v10, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv16i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e8, m2, ta, mu
+; RV32D-NEXT: vmv.v.i v10, 0
+; RV32D-NEXT: vmseq.vv v0, v10, v8
+; RV32D-NEXT: vrsub.vi v10, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v10
+; RV32D-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; RV32D-NEXT: vzext.vf4 v16, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v16
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV32D-NEXT: vnsrl.wi v16, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v16, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv16i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e8, m2, ta, mu
+; RV64D-NEXT: vmv.v.i v10, 0
+; RV64D-NEXT: vmseq.vv v0, v10, v8
+; RV64D-NEXT: vrsub.vi v10, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v10
+; RV64D-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; RV64D-NEXT: vzext.vf4 v16, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v16
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV64D-NEXT: vnsrl.wi v16, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v16, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: vmerge.vim v8, v8, 8, v0
+; RV64D-NEXT: ret
%a = call <vscale x 16 x i8> @llvm.cttz.nxv16i8(<vscale x 16 x i8> %va, i1 false)
ret <vscale x 16 x i8> %a
}
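
In the cttz checks just above, the +d pipelines first isolate the lowest set bit (vrsub.vi forms 0 - x and vand keeps x & -x), widen and convert it to f32 (vzext.vf4 plus vfcvt.f.xu.v for these i8 element types), read the biased exponent with the shift-by-23/narrow pair, and subtract only the f32 bias of 127. Since these tests do want a defined result at zero, a vmseq mask and a vmerge substitute the element width (8 here) in zero lanes. A rough scalar sketch of the per-lane arithmetic, with a hypothetical helper name and the zero case written as an early return rather than a merge:

// Sketch only, assuming IEEE-754 f32 (bias 127, 23 mantissa bits).
#include <cstdint>
#include <cstring>

unsigned cttz8_sketch(uint8_t x) {
  if (x == 0)
    return 8;                            // what vmerge.vim v8, v8, 8, v0 selects
  uint32_t v = x;
  uint32_t lsb = v & (0u - v);           // isolate the lowest set bit (a power of two)
  float f = static_cast<float>(lsb);     // exact: a small power of two
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof bits);   // reinterpret the f32 bits
  return (bits >> 23) - 127u;            // biased exponent minus the f32 bias
}

The i16 cases after the next hunk header use the same recipe through vfwcvt.f.xu.v and merge in 16 instead of 8.
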
@@ -185,26 +492,511 @@ define <vscale x 64 x i8> @cttz_nxv64i8(<vscale x 64 x i8> %va) {
declare <vscale x 64 x i8> @llvm.cttz.nxv64i8(<vscale x 64 x i8>, i1)
define <vscale x 1 x i16> @cttz_nxv1i16(<vscale x 1 x i16> %va) {
-; RV32-LABEL: cttz_nxv1i16:
+; RV32I-LABEL: cttz_nxv1i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e16, mf4, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv1i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e16, mf4, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv1i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; RV32D-NEXT: vmv.v.i v9, 0
+; RV32D-NEXT: vmseq.vv v0, v9, v8
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v9, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: addi a0, zero, 16
+; RV32D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv1i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; RV64D-NEXT: vmv.v.i v9, 0
+; RV64D-NEXT: vmseq.vv v0, v9, v8
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v9, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: addi a0, zero, 16
+; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 1 x i16> @llvm.cttz.nxv1i16(<vscale x 1 x i16> %va, i1 false)
+ ret <vscale x 1 x i16> %a
+}
+declare <vscale x 1 x i16> @llvm.cttz.nxv1i16(<vscale x 1 x i16>, i1)
+
+define <vscale x 2 x i16> @cttz_nxv2i16(<vscale x 2 x i16> %va) {
+; RV32I-LABEL: cttz_nxv2i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e16, mf2, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv2i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e16, mf2, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv2i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; RV32D-NEXT: vmv.v.i v9, 0
+; RV32D-NEXT: vmseq.vv v0, v9, v8
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v9, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: addi a0, zero, 16
+; RV32D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv2i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; RV64D-NEXT: vmv.v.i v9, 0
+; RV64D-NEXT: vmseq.vv v0, v9, v8
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v9, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: addi a0, zero, 16
+; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 2 x i16> @llvm.cttz.nxv2i16(<vscale x 2 x i16> %va, i1 false)
+ ret <vscale x 2 x i16> %a
+}
+declare <vscale x 2 x i16> @llvm.cttz.nxv2i16(<vscale x 2 x i16>, i1)
+
+define <vscale x 4 x i16> @cttz_nxv4i16(<vscale x 4 x i16> %va) {
+; RV32I-LABEL: cttz_nxv4i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv4i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv4i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV32D-NEXT: vmv.v.i v9, 0
+; RV32D-NEXT: vmseq.vv v0, v9, v8
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v10, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v10, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v10, a0
+; RV32D-NEXT: addi a0, zero, 16
+; RV32D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv4i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV64D-NEXT: vmv.v.i v9, 0
+; RV64D-NEXT: vmseq.vv v0, v9, v8
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v10, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v10, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v10, a0
+; RV64D-NEXT: addi a0, zero, 16
+; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 4 x i16> @llvm.cttz.nxv4i16(<vscale x 4 x i16> %va, i1 false)
+ ret <vscale x 4 x i16> %a
+}
+declare <vscale x 4 x i16> @llvm.cttz.nxv4i16(<vscale x 4 x i16>, i1)
+
+define <vscale x 8 x i16> @cttz_nxv8i16(<vscale x 8 x i16> %va) {
+; RV32I-LABEL: cttz_nxv8i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e16, m2, ta, mu
+; RV32I-NEXT: vsub.vx v10, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v10, v10, a0
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v10, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv8i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e16, m2, ta, mu
+; RV64I-NEXT: vsub.vx v10, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv8i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV32D-NEXT: vmv.v.i v10, 0
+; RV32D-NEXT: vmseq.vv v0, v10, v8
+; RV32D-NEXT: vrsub.vi v10, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v10
+; RV32D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v12, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v12, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v12, a0
+; RV32D-NEXT: addi a0, zero, 16
+; RV32D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv8i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV64D-NEXT: vmv.v.i v10, 0
+; RV64D-NEXT: vmseq.vv v0, v10, v8
+; RV64D-NEXT: vrsub.vi v10, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v10
+; RV64D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v12, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v12, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v12, a0
+; RV64D-NEXT: addi a0, zero, 16
+; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 8 x i16> @llvm.cttz.nxv8i16(<vscale x 8 x i16> %va, i1 false)
+ ret <vscale x 8 x i16> %a
+}
+declare <vscale x 8 x i16> @llvm.cttz.nxv8i16(<vscale x 8 x i16>, i1)
+
+define <vscale x 16 x i16> @cttz_nxv16i16(<vscale x 16 x i16> %va) {
+; RV32I-LABEL: cttz_nxv16i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e16, m4, ta, mu
+; RV32I-NEXT: vsub.vx v12, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v12, v12, a0
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v12, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv16i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e16, m4, ta, mu
+; RV64I-NEXT: vsub.vx v12, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vsub.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v12, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v12, v8
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv16i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV32D-NEXT: vmv.v.i v12, 0
+; RV32D-NEXT: vmseq.vv v0, v12, v8
+; RV32D-NEXT: vrsub.vi v12, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v12
+; RV32D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v16, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV32D-NEXT: vnsrl.wi v16, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v16, a0
+; RV32D-NEXT: addi a0, zero, 16
+; RV32D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv16i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV64D-NEXT: vmv.v.i v12, 0
+; RV64D-NEXT: vmseq.vv v0, v12, v8
+; RV64D-NEXT: vrsub.vi v12, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v12
+; RV64D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v16, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV64D-NEXT: vnsrl.wi v16, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v16, a0
+; RV64D-NEXT: addi a0, zero, 16
+; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 16 x i16> @llvm.cttz.nxv16i16(<vscale x 16 x i16> %va, i1 false)
+ ret <vscale x 16 x i16> %a
+}
+declare <vscale x 16 x i16> @llvm.cttz.nxv16i16(<vscale x 16 x i16>, i1)
+
+define <vscale x 32 x i16> @cttz_nxv32i16(<vscale x 32 x i16> %va) {
+; RV32-LABEL: cttz_nxv32i16:
; RV32: # %bb.0:
; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e16, mf4, ta, mu
-; RV32-NEXT: vsub.vx v9, v8, a0
+; RV32-NEXT: vsetvli a1, zero, e16, m8, ta, mu
+; RV32-NEXT: vsub.vx v16, v8, a0
; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 1
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: lui a0, 5
; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
+; RV32-NEXT: vand.vx v16, v16, a0
+; RV32-NEXT: vsub.vv v8, v8, v16
; RV32-NEXT: lui a0, 3
; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
+; RV32-NEXT: vand.vx v16, v8, a0
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
+; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
; RV32-NEXT: lui a0, 1
; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: vand.vx v8, v8, a0
@@ -213,26 +1005,26 @@ define <vscale x 1 x i16> @cttz_nxv1i16(<vscale x 1 x i16> %va) {
; RV32-NEXT: vsrl.vi v8, v8, 8
; RV32-NEXT: ret
;
-; RV64-LABEL: cttz_nxv1i16:
+; RV64-LABEL: cttz_nxv32i16:
; RV64: # %bb.0:
; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e16, mf4, ta, mu
-; RV64-NEXT: vsub.vx v9, v8, a0
+; RV64-NEXT: vsetvli a1, zero, e16, m8, ta, mu
+; RV64-NEXT: vsub.vx v16, v8, a0
; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 1
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 1
; RV64-NEXT: lui a0, 5
; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
+; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vsub.vv v8, v8, v16
; RV64-NEXT: lui a0, 3
; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
+; RV64-NEXT: vand.vx v16, v8, a0
; RV64-NEXT: vsrl.vi v8, v8, 2
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
; RV64-NEXT: lui a0, 1
; RV64-NEXT: addiw a0, a0, -241
; RV64-NEXT: vand.vx v8, v8, a0
@@ -240,637 +1032,480 @@ define <vscale x 1 x i16> @cttz_nxv1i16(<vscale x 1 x i16> %va) {
; RV64-NEXT: vmul.vx v8, v8, a0
; RV64-NEXT: vsrl.vi v8, v8, 8
; RV64-NEXT: ret
- %a = call <vscale x 1 x i16> @llvm.cttz.nxv1i16(<vscale x 1 x i16> %va, i1 false)
- ret <vscale x 1 x i16> %a
+ %a = call <vscale x 32 x i16> @llvm.cttz.nxv32i16(<vscale x 32 x i16> %va, i1 false)
+ ret <vscale x 32 x i16> %a
}
-declare <vscale x 1 x i16> @llvm.cttz.nxv1i16(<vscale x 1 x i16>, i1)
+declare <vscale x 32 x i16> @llvm.cttz.nxv32i16(<vscale x 32 x i16>, i1)
-define <vscale x 2 x i16> @cttz_nxv2i16(<vscale x 2 x i16> %va) {
-; RV32-LABEL: cttz_nxv2i16:
+define <vscale x 1 x i32> @cttz_nxv1i32(<vscale x 1 x i32> %va) {
+; RV32I-LABEL: cttz_nxv1i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e32, mf2, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv1i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e32, mf2, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv1i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v9, v8, v9
+; RV32D-NEXT: vfwcvt.f.xu.v v10, v9
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32D-NEXT: vsrl.vx v9, v10, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v9, 0
+; RV32D-NEXT: addi a0, zero, 1023
+; RV32D-NEXT: vsub.vx v9, v9, a0
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: addi a0, zero, 32
+; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv1i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vmv.v.i v9, 0
+; RV64D-NEXT: vmseq.vv v0, v9, v8
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64D-NEXT: vsrl.vx v8, v9, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 1023
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: addi a0, zero, 32
+; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 1 x i32> @llvm.cttz.nxv1i32(<vscale x 1 x i32> %va, i1 false)
+ ret <vscale x 1 x i32> %a
+}
+declare <vscale x 1 x i32> @llvm.cttz.nxv1i32(<vscale x 1 x i32>, i1)
+
+define <vscale x 2 x i32> @cttz_nxv2i32(<vscale x 2 x i32> %va) {
+; RV32I-LABEL: cttz_nxv2i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv2i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv2i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v9, v8, v9
+; RV32D-NEXT: vfwcvt.f.xu.v v10, v9
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; RV32D-NEXT: vsrl.vx v10, v10, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v9, v10, 0
+; RV32D-NEXT: addi a0, zero, 1023
+; RV32D-NEXT: vsub.vx v9, v9, a0
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: addi a0, zero, 32
+; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv2i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV64D-NEXT: vmv.v.i v9, 0
+; RV64D-NEXT: vmseq.vv v0, v9, v8
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; RV64D-NEXT: vsrl.vx v8, v10, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v10, v8, 0
+; RV64D-NEXT: addi a0, zero, 1023
+; RV64D-NEXT: vsub.vx v8, v10, a0
+; RV64D-NEXT: addi a0, zero, 32
+; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 2 x i32> @llvm.cttz.nxv2i32(<vscale x 2 x i32> %va, i1 false)
+ ret <vscale x 2 x i32> %a
+}
+declare <vscale x 2 x i32> @llvm.cttz.nxv2i32(<vscale x 2 x i32>, i1)
+
+define <vscale x 4 x i32> @cttz_nxv4i32(<vscale x 4 x i32> %va) {
+; RV32I-LABEL: cttz_nxv4i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, mu
+; RV32I-NEXT: vsub.vx v10, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v10, v10, a0
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v10, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv4i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e32, m2, ta, mu
+; RV64I-NEXT: vsub.vx v10, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv4i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV32D-NEXT: vrsub.vi v10, v8, 0
+; RV32D-NEXT: vand.vv v10, v8, v10
+; RV32D-NEXT: vfwcvt.f.xu.v v12, v10
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV32D-NEXT: vsrl.vx v12, v12, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v10, v12, 0
+; RV32D-NEXT: addi a0, zero, 1023
+; RV32D-NEXT: vsub.vx v10, v10, a0
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: addi a0, zero, 32
+; RV32D-NEXT: vmerge.vxm v8, v10, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv4i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV64D-NEXT: vmv.v.i v10, 0
+; RV64D-NEXT: vmseq.vv v0, v10, v8
+; RV64D-NEXT: vrsub.vi v10, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v10
+; RV64D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV64D-NEXT: vsrl.vx v8, v12, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v12, v8, 0
+; RV64D-NEXT: addi a0, zero, 1023
+; RV64D-NEXT: vsub.vx v8, v12, a0
+; RV64D-NEXT: addi a0, zero, 32
+; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 4 x i32> @llvm.cttz.nxv4i32(<vscale x 4 x i32> %va, i1 false)
+ ret <vscale x 4 x i32> %a
+}
+declare <vscale x 4 x i32> @llvm.cttz.nxv4i32(<vscale x 4 x i32>, i1)
+
+define <vscale x 8 x i32> @cttz_nxv8i32(<vscale x 8 x i32> %va) {
+; RV32I-LABEL: cttz_nxv8i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, mu
+; RV32I-NEXT: vsub.vx v12, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v12, v12, a0
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v12, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_nxv8i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e32, m4, ta, mu
+; RV64I-NEXT: vsub.vx v12, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vsub.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v12, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v12, v8
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv8i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV32D-NEXT: vrsub.vi v12, v8, 0
+; RV32D-NEXT: vand.vv v12, v8, v12
+; RV32D-NEXT: vfwcvt.f.xu.v v16, v12
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV32D-NEXT: vsrl.vx v16, v16, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32D-NEXT: vnsrl.wi v12, v16, 0
+; RV32D-NEXT: addi a0, zero, 1023
+; RV32D-NEXT: vsub.vx v12, v12, a0
+; RV32D-NEXT: vmseq.vi v0, v8, 0
+; RV32D-NEXT: addi a0, zero, 32
+; RV32D-NEXT: vmerge.vxm v8, v12, a0, v0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv8i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV64D-NEXT: vmv.v.i v12, 0
+; RV64D-NEXT: vmseq.vv v0, v12, v8
+; RV64D-NEXT: vrsub.vi v12, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v12
+; RV64D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64D-NEXT: vsrl.vx v8, v16, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64D-NEXT: vnsrl.wi v16, v8, 0
+; RV64D-NEXT: addi a0, zero, 1023
+; RV64D-NEXT: vsub.vx v8, v16, a0
+; RV64D-NEXT: addi a0, zero, 32
+; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV64D-NEXT: ret
+ %a = call <vscale x 8 x i32> @llvm.cttz.nxv8i32(<vscale x 8 x i32> %va, i1 false)
+ ret <vscale x 8 x i32> %a
+}
+declare <vscale x 8 x i32> @llvm.cttz.nxv8i32(<vscale x 8 x i32>, i1)
+
+define <vscale x 16 x i32> @cttz_nxv16i32(<vscale x 16 x i32> %va) {
+; RV32-LABEL: cttz_nxv16i32:
; RV32: # %bb.0:
; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e16, mf2, ta, mu
-; RV32-NEXT: vsub.vx v9, v8, a0
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, mu
+; RV32-NEXT: vsub.vx v16, v8, a0
; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 5
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: lui a0, 349525
; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 3
+; RV32-NEXT: vand.vx v16, v16, a0
+; RV32-NEXT: vsub.vv v8, v8, v16
+; RV32-NEXT: lui a0, 209715
; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
+; RV32-NEXT: vand.vx v16, v8, a0
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 1
+; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: lui a0, 61681
; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
+; RV32-NEXT: lui a0, 4112
+; RV32-NEXT: addi a0, a0, 257
; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
+; RV32-NEXT: vsrl.vi v8, v8, 24
; RV32-NEXT: ret
;
-; RV64-LABEL: cttz_nxv2i16:
+; RV64-LABEL: cttz_nxv16i32:
; RV64: # %bb.0:
; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e16, mf2, ta, mu
-; RV64-NEXT: vsub.vx v9, v8, a0
+; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, mu
+; RV64-NEXT: vsub.vx v16, v8, a0
; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 5
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vsrl.vi v16, v8, 1
+; RV64-NEXT: lui a0, 349525
; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 3
+; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vsub.vv v8, v8, v16
+; RV64-NEXT: lui a0, 209715
; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
+; RV64-NEXT: vand.vx v16, v8, a0
; RV64-NEXT: vsrl.vi v8, v8, 2
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 1
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vsrl.vi v16, v8, 4
+; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: lui a0, 61681
; RV64-NEXT: addiw a0, a0, -241
; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
+; RV64-NEXT: lui a0, 4112
+; RV64-NEXT: addiw a0, a0, 257
; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
+; RV64-NEXT: vsrl.vi v8, v8, 24
; RV64-NEXT: ret
- %a = call <vscale x 2 x i16> @llvm.cttz.nxv2i16(<vscale x 2 x i16> %va, i1 false)
- ret <vscale x 2 x i16> %a
+ %a = call <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32> %va, i1 false)
+ ret <vscale x 16 x i32> %a
}
-declare <vscale x 2 x i16> @llvm.cttz.nxv2i16(<vscale x 2 x i16>, i1)
+declare <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32>, i1)
-define <vscale x 4 x i16> @cttz_nxv4i16(<vscale x 4 x i16> %va) {
-; RV32-LABEL: cttz_nxv4i16:
+define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
+; RV32-LABEL: cttz_nxv1i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu
-; RV32-NEXT: vsub.vx v9, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
-;
-; RV64-LABEL: cttz_nxv4i16:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu
-; RV64-NEXT: vsub.vx v9, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
- %a = call <vscale x 4 x i16> @llvm.cttz.nxv4i16(<vscale x 4 x i16> %va, i1 false)
- ret <vscale x 4 x i16> %a
-}
-declare <vscale x 4 x i16> @llvm.cttz.nxv4i16(<vscale x 4 x i16>, i1)
-
-define <vscale x 8 x i16> @cttz_nxv8i16(<vscale x 8 x i16> %va) {
-; RV32-LABEL: cttz_nxv8i16:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e16, m2, ta, mu
-; RV32-NEXT: vsub.vx v10, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v10, v10, a0
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v10, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
-;
-; RV64-LABEL: cttz_nxv8i16:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e16, m2, ta, mu
-; RV64-NEXT: vsub.vx v10, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v10, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
- %a = call <vscale x 8 x i16> @llvm.cttz.nxv8i16(<vscale x 8 x i16> %va, i1 false)
- ret <vscale x 8 x i16> %a
-}
-declare <vscale x 8 x i16> @llvm.cttz.nxv8i16(<vscale x 8 x i16>, i1)
-
-define <vscale x 16 x i16> @cttz_nxv16i16(<vscale x 16 x i16> %va) {
-; RV32-LABEL: cttz_nxv16i16:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e16, m4, ta, mu
-; RV32-NEXT: vsub.vx v12, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v12, v12, a0
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v12, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
-;
-; RV64-LABEL: cttz_nxv16i16:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e16, m4, ta, mu
-; RV64-NEXT: vsub.vx v12, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v12, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
- %a = call <vscale x 16 x i16> @llvm.cttz.nxv16i16(<vscale x 16 x i16> %va, i1 false)
- ret <vscale x 16 x i16> %a
-}
-declare <vscale x 16 x i16> @llvm.cttz.nxv16i16(<vscale x 16 x i16>, i1)
-
-define <vscale x 32 x i16> @cttz_nxv32i16(<vscale x 32 x i16> %va) {
-; RV32-LABEL: cttz_nxv32i16:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e16, m8, ta, mu
-; RV32-NEXT: vsub.vx v16, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v16, v16, a0
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v16, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
-;
-; RV64-LABEL: cttz_nxv32i16:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e16, m8, ta, mu
-; RV64-NEXT: vsub.vx v16, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
- %a = call <vscale x 32 x i16> @llvm.cttz.nxv32i16(<vscale x 32 x i16> %va, i1 false)
- ret <vscale x 32 x i16> %a
-}
-declare <vscale x 32 x i16> @llvm.cttz.nxv32i16(<vscale x 32 x i16>, i1)
-
-define <vscale x 1 x i32> @cttz_nxv1i32(<vscale x 1 x i32> %va) {
-; RV32-LABEL: cttz_nxv1i32:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, mu
-; RV32-NEXT: vsub.vx v9, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
-;
-; RV64-LABEL: cttz_nxv1i32:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, mu
-; RV64-NEXT: vsub.vx v9, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
- %a = call <vscale x 1 x i32> @llvm.cttz.nxv1i32(<vscale x 1 x i32> %va, i1 false)
- ret <vscale x 1 x i32> %a
-}
-declare <vscale x 1 x i32> @llvm.cttz.nxv1i32(<vscale x 1 x i32>, i1)
-
-define <vscale x 2 x i32> @cttz_nxv2i32(<vscale x 2 x i32> %va) {
-; RV32-LABEL: cttz_nxv2i32:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, mu
-; RV32-NEXT: vsub.vx v9, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
-;
-; RV64-LABEL: cttz_nxv2i32:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, mu
-; RV64-NEXT: vsub.vx v9, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
- %a = call <vscale x 2 x i32> @llvm.cttz.nxv2i32(<vscale x 2 x i32> %va, i1 false)
- ret <vscale x 2 x i32> %a
-}
-declare <vscale x 2 x i32> @llvm.cttz.nxv2i32(<vscale x 2 x i32>, i1)
-
-define <vscale x 4 x i32> @cttz_nxv4i32(<vscale x 4 x i32> %va) {
-; RV32-LABEL: cttz_nxv4i32:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, mu
-; RV32-NEXT: vsub.vx v10, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v10, v10, a0
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v10, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
-;
-; RV64-LABEL: cttz_nxv4i32:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, mu
-; RV64-NEXT: vsub.vx v10, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v10, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
- %a = call <vscale x 4 x i32> @llvm.cttz.nxv4i32(<vscale x 4 x i32> %va, i1 false)
- ret <vscale x 4 x i32> %a
-}
-declare <vscale x 4 x i32> @llvm.cttz.nxv4i32(<vscale x 4 x i32>, i1)
-
-define <vscale x 8 x i32> @cttz_nxv8i32(<vscale x 8 x i32> %va) {
-; RV32-LABEL: cttz_nxv8i32:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsub.vx v12, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v12, v12, a0
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v12, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
-;
-; RV64-LABEL: cttz_nxv8i32:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV64-NEXT: vsub.vx v12, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v12, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
- %a = call <vscale x 8 x i32> @llvm.cttz.nxv8i32(<vscale x 8 x i32> %va, i1 false)
- ret <vscale x 8 x i32> %a
-}
-declare <vscale x 8 x i32> @llvm.cttz.nxv8i32(<vscale x 8 x i32>, i1)
-
-define <vscale x 16 x i32> @cttz_nxv16i32(<vscale x 16 x i32> %va) {
-; RV32-LABEL: cttz_nxv16i32:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, mu
-; RV32-NEXT: vsub.vx v16, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v16, v16, a0
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v16, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
-;
-; RV64-LABEL: cttz_nxv16i32:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, mu
-; RV64-NEXT: vsub.vx v16, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
- %a = call <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32> %va, i1 false)
- ret <vscale x 16 x i32> %a
-}
-declare <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32>, i1)
-
-define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
-; RV32-LABEL: cttz_nxv1i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: lui a0, 349525
; RV32-NEXT: addi a0, a0, 1365
; RV32-NEXT: sw a0, 12(sp)
; RV32-NEXT: sw a0, 8(sp)
@@ -1290,126 +1925,401 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
declare <vscale x 8 x i64> @llvm.cttz.nxv8i64(<vscale x 8 x i64>, i1)
define <vscale x 1 x i8> @cttz_zero_undef_nxv1i8(<vscale x 1 x i8> %va) {
-; CHECK-LABEL: cttz_zero_undef_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, zero, 1
-; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, mu
-; CHECK-NEXT: vsub.vx v9, v8, a0
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vand.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv1i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e8, mf8, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_zero_undef_nxv1i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e8, mf8, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_zero_undef_nxv1i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e8, mf8, ta, mu
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vzext.vf4 v9, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v9
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv1i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e8, mf8, ta, mu
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vzext.vf4 v9, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v9
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: ret
%a = call <vscale x 1 x i8> @llvm.cttz.nxv1i8(<vscale x 1 x i8> %va, i1 true)
ret <vscale x 1 x i8> %a
}
define <vscale x 2 x i8> @cttz_zero_undef_nxv2i8(<vscale x 2 x i8> %va) {
-; CHECK-LABEL: cttz_zero_undef_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, zero, 1
-; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, mu
-; CHECK-NEXT: vsub.vx v9, v8, a0
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vand.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv2i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e8, mf4, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_zero_undef_nxv2i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e8, mf4, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_zero_undef_nxv2i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e8, mf4, ta, mu
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV32D-NEXT: vzext.vf4 v9, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v9
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv2i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e8, mf4, ta, mu
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV64D-NEXT: vzext.vf4 v9, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v9
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: ret
%a = call <vscale x 2 x i8> @llvm.cttz.nxv2i8(<vscale x 2 x i8> %va, i1 true)
ret <vscale x 2 x i8> %a
}
define <vscale x 4 x i8> @cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
-; CHECK-LABEL: cttz_zero_undef_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, zero, 1
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu
-; CHECK-NEXT: vsub.vx v9, v8, a0
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vand.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv4i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e8, mf2, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_zero_undef_nxv4i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e8, mf2, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_zero_undef_nxv4i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e8, mf2, ta, mu
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32D-NEXT: vzext.vf4 v10, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v10
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v10, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v10, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv4i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e8, mf2, ta, mu
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64D-NEXT: vzext.vf4 v10, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v10
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v10, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v10, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: ret
%a = call <vscale x 4 x i8> @llvm.cttz.nxv4i8(<vscale x 4 x i8> %va, i1 true)
ret <vscale x 4 x i8> %a
}
define <vscale x 8 x i8> @cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
-; CHECK-LABEL: cttz_zero_undef_nxv8i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, zero, 1
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu
-; CHECK-NEXT: vsub.vx v9, v8, a0
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vand.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v9, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vsrl.vi v9, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv8i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e8, m1, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_zero_undef_nxv8i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e8, m1, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_zero_undef_nxv8i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e8, m1, ta, mu
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32D-NEXT: vzext.vf4 v12, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v12
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v12, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v12, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv8i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e8, m1, ta, mu
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64D-NEXT: vzext.vf4 v12, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v12
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v12, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v12, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: ret
%a = call <vscale x 8 x i8> @llvm.cttz.nxv8i8(<vscale x 8 x i8> %va, i1 true)
ret <vscale x 8 x i8> %a
}
define <vscale x 16 x i8> @cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
-; CHECK-LABEL: cttz_zero_undef_nxv16i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, zero, 1
-; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, mu
-; CHECK-NEXT: vsub.vx v10, v8, a0
-; CHECK-NEXT: vxor.vi v8, v8, -1
-; CHECK-NEXT: vand.vv v8, v8, v10
-; CHECK-NEXT: vsrl.vi v10, v8, 1
-; CHECK-NEXT: addi a0, zero, 85
-; CHECK-NEXT: vand.vx v10, v10, a0
-; CHECK-NEXT: vsub.vv v8, v8, v10
-; CHECK-NEXT: addi a0, zero, 51
-; CHECK-NEXT: vand.vx v10, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vadd.vv v8, v10, v8
-; CHECK-NEXT: vsrl.vi v10, v8, 4
-; CHECK-NEXT: vadd.vv v8, v8, v10
-; CHECK-NEXT: vand.vi v8, v8, 15
-; CHECK-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv16i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e8, m2, ta, mu
+; RV32I-NEXT: vsub.vx v10, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: addi a0, zero, 85
+; RV32I-NEXT: vand.vx v10, v10, a0
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: addi a0, zero, 51
+; RV32I-NEXT: vand.vx v10, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
+; RV32I-NEXT: vand.vi v8, v8, 15
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_zero_undef_nxv16i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e8, m2, ta, mu
+; RV64I-NEXT: vsub.vx v10, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: addi a0, zero, 85
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: addi a0, zero, 51
+; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: vand.vi v8, v8, 15
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_zero_undef_nxv16i8:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e8, m2, ta, mu
+; RV32D-NEXT: vrsub.vi v10, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v10
+; RV32D-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; RV32D-NEXT: vzext.vf4 v16, v8
+; RV32D-NEXT: vfcvt.f.xu.v v8, v16
+; RV32D-NEXT: vsrl.vi v8, v8, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV32D-NEXT: vnsrl.wi v16, v8, 0
+; RV32D-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v16, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv16i8:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e8, m2, ta, mu
+; RV64D-NEXT: vrsub.vi v10, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v10
+; RV64D-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; RV64D-NEXT: vzext.vf4 v16, v8
+; RV64D-NEXT: vfcvt.f.xu.v v8, v16
+; RV64D-NEXT: vsrl.vi v8, v8, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV64D-NEXT: vnsrl.wi v16, v8, 0
+; RV64D-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v16, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: ret
%a = call <vscale x 16 x i8> @llvm.cttz.nxv16i8(<vscale x 16 x i8> %va, i1 true)
ret <vscale x 16 x i8> %a
}
@@ -1465,301 +2375,441 @@ define <vscale x 64 x i8> @cttz_zero_undef_nxv64i8(<vscale x 64 x i8> %va) {
}
define <vscale x 1 x i16> @cttz_zero_undef_nxv1i16(<vscale x 1 x i16> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv1i16:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e16, mf4, ta, mu
-; RV32-NEXT: vsub.vx v9, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv1i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e16, mf4, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_zero_undef_nxv1i16:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e16, mf4, ta, mu
-; RV64-NEXT: vsub.vx v9, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_zero_undef_nxv1i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e16, mf4, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_zero_undef_nxv1i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v9, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv1i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v9, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: ret
%a = call <vscale x 1 x i16> @llvm.cttz.nxv1i16(<vscale x 1 x i16> %va, i1 true)
ret <vscale x 1 x i16> %a
}
define <vscale x 2 x i16> @cttz_zero_undef_nxv2i16(<vscale x 2 x i16> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv2i16:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e16, mf2, ta, mu
-; RV32-NEXT: vsub.vx v9, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
-;
-; RV64-LABEL: cttz_zero_undef_nxv2i16:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e16, mf2, ta, mu
-; RV64-NEXT: vsub.vx v9, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
- %a = call <vscale x 2 x i16> @llvm.cttz.nxv2i16(<vscale x 2 x i16> %va, i1 true)
- ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 4 x i16> @cttz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv4i16:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu
-; RV32-NEXT: vsub.vx v9, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv2i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e16, mf2, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_zero_undef_nxv4i16:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu
-; RV64-NEXT: vsub.vx v9, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
- %a = call <vscale x 4 x i16> @llvm.cttz.nxv4i16(<vscale x 4 x i16> %va, i1 true)
- ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 8 x i16> @cttz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv8i16:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e16, m2, ta, mu
-; RV32-NEXT: vsub.vx v10, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v10, v10, a0
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v10, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
+; RV64I-LABEL: cttz_zero_undef_nxv2i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e16, mf2, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
;
-; RV64-LABEL: cttz_zero_undef_nxv8i16:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e16, m2, ta, mu
-; RV64-NEXT: vsub.vx v10, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v10, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
+; RV32D-LABEL: cttz_zero_undef_nxv2i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v9, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv2i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v9, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: ret
+ %a = call <vscale x 2 x i16> @llvm.cttz.nxv2i16(<vscale x 2 x i16> %va, i1 true)
+ ret <vscale x 2 x i16> %a
+}
+
+define <vscale x 4 x i16> @cttz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
+; RV32I-LABEL: cttz_zero_undef_nxv4i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_zero_undef_nxv4i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_zero_undef_nxv4i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v10, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v10, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v10, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv4i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v10, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v10, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v10, a0
+; RV64D-NEXT: ret
+ %a = call <vscale x 4 x i16> @llvm.cttz.nxv4i16(<vscale x 4 x i16> %va, i1 true)
+ ret <vscale x 4 x i16> %a
+}
+
+define <vscale x 8 x i16> @cttz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
+; RV32I-LABEL: cttz_zero_undef_nxv8i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e16, m2, ta, mu
+; RV32I-NEXT: vsub.vx v10, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v10, v10, a0
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v10, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: cttz_zero_undef_nxv8i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e16, m2, ta, mu
+; RV64I-NEXT: vsub.vx v10, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_zero_undef_nxv8i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV32D-NEXT: vrsub.vi v10, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v10
+; RV32D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v12, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v12, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v12, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv8i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV64D-NEXT: vrsub.vi v10, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v10
+; RV64D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v12, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v12, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v12, a0
+; RV64D-NEXT: ret
%a = call <vscale x 8 x i16> @llvm.cttz.nxv8i16(<vscale x 8 x i16> %va, i1 true)
ret <vscale x 8 x i16> %a
}
define <vscale x 16 x i16> @cttz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv16i16:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e16, m4, ta, mu
-; RV32-NEXT: vsub.vx v12, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a0, 5
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v12, v12, a0
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a0, 3
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v12, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
-; RV32-NEXT: lui a0, 1
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: addi a0, zero, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 8
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv16i16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e16, m4, ta, mu
+; RV32I-NEXT: vsub.vx v12, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: lui a0, 5
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v12, v12, a0
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 3
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v12, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 1
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: addi a0, zero, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 8
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_zero_undef_nxv16i16:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e16, m4, ta, mu
-; RV64-NEXT: vsub.vx v12, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 5
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 3
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v12, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 1
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: addi a0, zero, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 8
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_zero_undef_nxv16i16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e16, m4, ta, mu
+; RV64I-NEXT: vsub.vx v12, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: lui a0, 5
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vsub.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 3
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v12, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v12, v8
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 1
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: addi a0, zero, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 8
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_zero_undef_nxv16i16:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV32D-NEXT: vrsub.vi v12, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v12
+; RV32D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV32D-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; RV32D-NEXT: vsrl.vi v8, v16, 23
+; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV32D-NEXT: vnsrl.wi v16, v8, 0
+; RV32D-NEXT: addi a0, zero, 127
+; RV32D-NEXT: vsub.vx v8, v16, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv16i16:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; RV64D-NEXT: vrsub.vi v12, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v12
+; RV64D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV64D-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; RV64D-NEXT: vsrl.vi v8, v16, 23
+; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV64D-NEXT: vnsrl.wi v16, v8, 0
+; RV64D-NEXT: addi a0, zero, 127
+; RV64D-NEXT: vsub.vx v8, v16, a0
+; RV64D-NEXT: ret
%a = call <vscale x 16 x i16> @llvm.cttz.nxv16i16(<vscale x 16 x i16> %va, i1 true)
ret <vscale x 16 x i16> %a
}
@@ -1825,249 +2875,369 @@ define <vscale x 32 x i16> @cttz_zero_undef_nxv32i16(<vscale x 32 x i16> %va) {
}
define <vscale x 1 x i32> @cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv1i32:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, mu
-; RV32-NEXT: vsub.vx v9, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv1i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e32, mf2, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_zero_undef_nxv1i32:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, mu
-; RV64-NEXT: vsub.vx v9, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_zero_undef_nxv1i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e32, mf2, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_zero_undef_nxv1i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32D-NEXT: vsrl.vx v8, v9, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV32D-NEXT: vnsrl.wi v8, v8, 0
+; RV32D-NEXT: addi a0, zero, 1023
+; RV32D-NEXT: vsub.vx v8, v8, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv1i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vfwcvt.f.xu.v v9, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64D-NEXT: vsrl.vx v8, v9, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV64D-NEXT: vnsrl.wi v8, v8, 0
+; RV64D-NEXT: addi a0, zero, 1023
+; RV64D-NEXT: vsub.vx v8, v8, a0
+; RV64D-NEXT: ret
%a = call <vscale x 1 x i32> @llvm.cttz.nxv1i32(<vscale x 1 x i32> %va, i1 true)
ret <vscale x 1 x i32> %a
}
define <vscale x 2 x i32> @cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv2i32:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, mu
-; RV32-NEXT: vsub.vx v9, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v9, v9, a0
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v9, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv2i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v9, v9, a0
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v9, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v9
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_zero_undef_nxv2i32:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, mu
-; RV64-NEXT: vsub.vx v9, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v9, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_zero_undef_nxv2i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v9, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_zero_undef_nxv2i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV32D-NEXT: vrsub.vi v9, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v9
+; RV32D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; RV32D-NEXT: vsrl.vx v8, v10, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV32D-NEXT: vnsrl.wi v10, v8, 0
+; RV32D-NEXT: addi a0, zero, 1023
+; RV32D-NEXT: vsub.vx v8, v10, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv2i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v9
+; RV64D-NEXT: vfwcvt.f.xu.v v10, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; RV64D-NEXT: vsrl.vx v8, v10, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV64D-NEXT: vnsrl.wi v10, v8, 0
+; RV64D-NEXT: addi a0, zero, 1023
+; RV64D-NEXT: vsub.vx v8, v10, a0
+; RV64D-NEXT: ret
%a = call <vscale x 2 x i32> @llvm.cttz.nxv2i32(<vscale x 2 x i32> %va, i1 true)
ret <vscale x 2 x i32> %a
}
define <vscale x 4 x i32> @cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv4i32:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, mu
-; RV32-NEXT: vsub.vx v10, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v10, v10, a0
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v10, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v10
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv4i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, mu
+; RV32I-NEXT: vsub.vx v10, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v10, v10, a0
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v10, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v10
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_zero_undef_nxv4i32:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, mu
-; RV64-NEXT: vsub.vx v10, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v10, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_zero_undef_nxv4i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e32, m2, ta, mu
+; RV64I-NEXT: vsub.vx v10, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v10, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_zero_undef_nxv4i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV32D-NEXT: vrsub.vi v10, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v10
+; RV32D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV32D-NEXT: vsrl.vx v8, v12, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32D-NEXT: vnsrl.wi v12, v8, 0
+; RV32D-NEXT: addi a0, zero, 1023
+; RV32D-NEXT: vsub.vx v8, v12, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv4i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; RV64D-NEXT: vrsub.vi v10, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v10
+; RV64D-NEXT: vfwcvt.f.xu.v v12, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV64D-NEXT: vsrl.vx v8, v12, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64D-NEXT: vnsrl.wi v12, v8, 0
+; RV64D-NEXT: addi a0, zero, 1023
+; RV64D-NEXT: vsub.vx v8, v12, a0
+; RV64D-NEXT: ret
%a = call <vscale x 4 x i32> @llvm.cttz.nxv4i32(<vscale x 4 x i32> %va, i1 true)
ret <vscale x 4 x i32> %a
}
define <vscale x 8 x i32> @cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv8i32:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, zero, 1
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsub.vx v12, v8, a0
-; RV32-NEXT: vxor.vi v8, v8, -1
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v12, v12, a0
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v12, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v12
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv8i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a0, zero, 1
+; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, mu
+; RV32I-NEXT: vsub.vx v12, v8, a0
+; RV32I-NEXT: vxor.vi v8, v8, -1
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v12, v12, a0
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v12, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v12
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_zero_undef_nxv8i32:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, zero, 1
-; RV64-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV64-NEXT: vsub.vx v12, v8, a0
-; RV64-NEXT: vxor.vi v8, v8, -1
-; RV64-NEXT: vand.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v12, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_zero_undef_nxv8i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi a0, zero, 1
+; RV64I-NEXT: vsetvli a1, zero, e32, m4, ta, mu
+; RV64I-NEXT: vsub.vx v12, v8, a0
+; RV64I-NEXT: vxor.vi v8, v8, -1
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vsub.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v12, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v12, v8
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; RV32D-LABEL: cttz_zero_undef_nxv8i32:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV32D-NEXT: vrsub.vi v12, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v12
+; RV32D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV32D-NEXT: addi a0, zero, 52
+; RV32D-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV32D-NEXT: vsrl.vx v8, v16, a0
+; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32D-NEXT: vnsrl.wi v16, v8, 0
+; RV32D-NEXT: addi a0, zero, 1023
+; RV32D-NEXT: vsub.vx v8, v16, a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_zero_undef_nxv8i32:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e32, m4, ta, mu
+; RV64D-NEXT: vrsub.vi v12, v8, 0
+; RV64D-NEXT: vand.vv v8, v8, v12
+; RV64D-NEXT: vfwcvt.f.xu.v v16, v8
+; RV64D-NEXT: addi a0, zero, 52
+; RV64D-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64D-NEXT: vsrl.vx v8, v16, a0
+; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64D-NEXT: vnsrl.wi v16, v8, 0
+; RV64D-NEXT: addi a0, zero, 1023
+; RV64D-NEXT: vsub.vx v8, v16, a0
+; RV64D-NEXT: ret
%a = call <vscale x 8 x i32> @llvm.cttz.nxv8i32(<vscale x 8 x i32> %va, i1 true)
ret <vscale x 8 x i32> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
index ece41795686bc..1e68636edd570 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
@@ -1,8 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32,LMULMAX2-RV32I
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64,LMULMAX2-RV64I
; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32,LMULMAX2-RV32D
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64,LMULMAX2-RV64D
+; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX8-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX8-RV64
define void @ctlz_v16i8(<16 x i8>* %x, <16 x i8>* %y) nounwind {
; LMULMAX2-RV32-LABEL: ctlz_v16i8:
@@ -108,6 +114,42 @@ define void @ctlz_v16i8(<16 x i8>* %x, <16 x i8>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vand.vi v8, v8, 15
; LMULMAX1-RV64-NEXT: vse8.v v8, (a0)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: ctlz_v16i8:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vle8.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; LMULMAX8-RV32-NEXT: vzext.vf4 v12, v8
+; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v12, v12
+; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 23
+; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vnsrl.wi v9, v10, 0
+; LMULMAX8-RV32-NEXT: addi a1, zero, 134
+; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV32-NEXT: vrsub.vx v8, v9, a1
+; LMULMAX8-RV32-NEXT: vmerge.vim v8, v8, 8, v0
+; LMULMAX8-RV32-NEXT: vse8.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: ctlz_v16i8:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; LMULMAX8-RV64-NEXT: vle8.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; LMULMAX8-RV64-NEXT: vzext.vf4 v12, v8
+; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v12, v12
+; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 23
+; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; LMULMAX8-RV64-NEXT: vnsrl.wi v9, v10, 0
+; LMULMAX8-RV64-NEXT: addi a1, zero, 134
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: vrsub.vx v8, v9, a1
+; LMULMAX8-RV64-NEXT: vmerge.vim v8, v8, 8, v0
+; LMULMAX8-RV64-NEXT: vse8.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <16 x i8>, <16 x i8>* %x
%b = load <16 x i8>, <16 x i8>* %y
%c = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
@@ -117,75 +159,75 @@ define void @ctlz_v16i8(<16 x i8>* %x, <16 x i8>* %y) nounwind {
declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1)
define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) nounwind {
-; LMULMAX2-RV32-LABEL: ctlz_v8i16:
-; LMULMAX2-RV32: # %bb.0:
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu
-; LMULMAX2-RV32-NEXT: vle16.v v8, (a0)
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 2
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 8
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV32-NEXT: lui a1, 5
-; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1
-; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 3
-; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 1
-; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: addi a1, zero, 257
-; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 8
-; LMULMAX2-RV32-NEXT: vse16.v v8, (a0)
-; LMULMAX2-RV32-NEXT: ret
+; LMULMAX2-RV32I-LABEL: ctlz_v8i16:
+; LMULMAX2-RV32I: # %bb.0:
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; LMULMAX2-RV32I-NEXT: vle16.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 2
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 8
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV32I-NEXT: lui a1, 5
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1
+; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 3
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 1
+; LMULMAX2-RV32I-NEXT: addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: addi a1, zero, 257
+; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 8
+; LMULMAX2-RV32I-NEXT: vse16.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: ret
;
-; LMULMAX2-RV64-LABEL: ctlz_v8i16:
-; LMULMAX2-RV64: # %bb.0:
-; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu
-; LMULMAX2-RV64-NEXT: vle16.v v8, (a0)
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 2
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 8
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV64-NEXT: lui a1, 5
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1
-; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: lui a1, 3
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: lui a1, 1
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: addi a1, zero, 257
-; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 8
-; LMULMAX2-RV64-NEXT: vse16.v v8, (a0)
-; LMULMAX2-RV64-NEXT: ret
+; LMULMAX2-RV64I-LABEL: ctlz_v8i16:
+; LMULMAX2-RV64I: # %bb.0:
+; LMULMAX2-RV64I-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; LMULMAX2-RV64I-NEXT: vle16.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 2
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 8
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV64I-NEXT: lui a1, 5
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365
+; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1
+; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: lui a1, 3
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819
+; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: lui a1, 1
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: addi a1, zero, 257
+; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 8
+; LMULMAX2-RV64I-NEXT: vse16.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: ret
;
; LMULMAX1-RV32-LABEL: ctlz_v8i16:
; LMULMAX1-RV32: # %bb.0:
@@ -256,6 +298,62 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 8
; LMULMAX1-RV64-NEXT: vse16.v v8, (a0)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX2-RV32D-LABEL: ctlz_v8i16:
+; LMULMAX2-RV32D: # %bb.0:
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; LMULMAX2-RV32D-NEXT: vle16.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: vfwcvt.f.xu.v v10, v8
+; LMULMAX2-RV32D-NEXT: vnsrl.wi v9, v10, 23
+; LMULMAX2-RV32D-NEXT: addi a1, zero, 142
+; LMULMAX2-RV32D-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV32D-NEXT: addi a1, zero, 16
+; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32D-NEXT: vse16.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: ret
+;
+; LMULMAX2-RV64D-LABEL: ctlz_v8i16:
+; LMULMAX2-RV64D: # %bb.0:
+; LMULMAX2-RV64D-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; LMULMAX2-RV64D-NEXT: vle16.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: vfwcvt.f.xu.v v10, v8
+; LMULMAX2-RV64D-NEXT: vnsrl.wi v9, v10, 23
+; LMULMAX2-RV64D-NEXT: addi a1, zero, 142
+; LMULMAX2-RV64D-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT: addi a1, zero, 16
+; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64D-NEXT: vse16.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: ctlz_v8i16:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vle16.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v10, v8
+; LMULMAX8-RV32-NEXT: vnsrl.wi v9, v10, 23
+; LMULMAX8-RV32-NEXT: addi a1, zero, 142
+; LMULMAX8-RV32-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV32-NEXT: addi a1, zero, 16
+; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX8-RV32-NEXT: vse16.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: ctlz_v8i16:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; LMULMAX8-RV64-NEXT: vle16.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v10, v8
+; LMULMAX8-RV64-NEXT: vnsrl.wi v9, v10, 23
+; LMULMAX8-RV64-NEXT: addi a1, zero, 142
+; LMULMAX8-RV64-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: addi a1, zero, 16
+; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX8-RV64-NEXT: vse16.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <8 x i16>, <8 x i16>* %x
%b = load <8 x i16>, <8 x i16>* %y
%c = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
@@ -265,81 +363,81 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) nounwind {
declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1)
define void @ctlz_v4i32(<4 x i32>* %x, <4 x i32>* %y) nounwind {
-; LMULMAX2-RV32-LABEL: ctlz_v4i32:
-; LMULMAX2-RV32: # %bb.0:
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-RV32-NEXT: vle32.v v8, (a0)
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 2
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 8
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 16
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV32-NEXT: lui a1, 349525
-; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1
-; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 209715
-; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 61681
-; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: lui a1, 4112
-; LMULMAX2-RV32-NEXT: addi a1, a1, 257
-; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX2-RV32-NEXT: vse32.v v8, (a0)
-; LMULMAX2-RV32-NEXT: ret
+; LMULMAX2-RV32I-LABEL: ctlz_v4i32:
+; LMULMAX2-RV32I: # %bb.0:
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX2-RV32I-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 2
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 8
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 16
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV32I-NEXT: lui a1, 349525
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1
+; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 209715
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 61681
+; LMULMAX2-RV32I-NEXT: addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: lui a1, 4112
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 24
+; LMULMAX2-RV32I-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: ret
;
-; LMULMAX2-RV64-LABEL: ctlz_v4i32:
-; LMULMAX2-RV64: # %bb.0:
-; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-RV64-NEXT: vle32.v v8, (a0)
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 2
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 8
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 16
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV64-NEXT: lui a1, 349525
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1
-; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: lui a1, 209715
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: lui a1, 61681
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: lui a1, 4112
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 257
-; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX2-RV64-NEXT: vse32.v v8, (a0)
-; LMULMAX2-RV64-NEXT: ret
+; LMULMAX2-RV64I-LABEL: ctlz_v4i32:
+; LMULMAX2-RV64I: # %bb.0:
+; LMULMAX2-RV64I-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX2-RV64I-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 2
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 8
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 16
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV64I-NEXT: lui a1, 349525
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365
+; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1
+; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: lui a1, 209715
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819
+; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: lui a1, 61681
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: lui a1, 4112
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 257
+; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 24
+; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: ret
;
; LMULMAX1-RV32-LABEL: ctlz_v4i32:
; LMULMAX1-RV32: # %bb.0:
@@ -416,6 +514,66 @@ define void @ctlz_v4i32(<4 x i32>* %x, <4 x i32>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24
; LMULMAX1-RV64-NEXT: vse32.v v8, (a0)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX2-RV32D-LABEL: ctlz_v4i32:
+; LMULMAX2-RV32D: # %bb.0:
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX2-RV32D-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: vfwcvt.f.xu.v v10, v8
+; LMULMAX2-RV32D-NEXT: addi a1, zero, 52
+; LMULMAX2-RV32D-NEXT: vnsrl.wx v9, v10, a1
+; LMULMAX2-RV32D-NEXT: addi a1, zero, 1054
+; LMULMAX2-RV32D-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV32D-NEXT: addi a1, zero, 32
+; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32D-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: ret
+;
+; LMULMAX2-RV64D-LABEL: ctlz_v4i32:
+; LMULMAX2-RV64D: # %bb.0:
+; LMULMAX2-RV64D-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX2-RV64D-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: vfwcvt.f.xu.v v10, v8
+; LMULMAX2-RV64D-NEXT: addi a1, zero, 52
+; LMULMAX2-RV64D-NEXT: vnsrl.wx v9, v10, a1
+; LMULMAX2-RV64D-NEXT: addi a1, zero, 1054
+; LMULMAX2-RV64D-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT: addi a1, zero, 32
+; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64D-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: ctlz_v4i32:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vle32.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v10, v8
+; LMULMAX8-RV32-NEXT: addi a1, zero, 52
+; LMULMAX8-RV32-NEXT: vnsrl.wx v9, v10, a1
+; LMULMAX8-RV32-NEXT: addi a1, zero, 1054
+; LMULMAX8-RV32-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV32-NEXT: addi a1, zero, 32
+; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX8-RV32-NEXT: vse32.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: ctlz_v4i32:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV64-NEXT: vle32.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v10, v8
+; LMULMAX8-RV64-NEXT: addi a1, zero, 52
+; LMULMAX8-RV64-NEXT: vnsrl.wx v9, v10, a1
+; LMULMAX8-RV64-NEXT: addi a1, zero, 1054
+; LMULMAX8-RV64-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: addi a1, zero, 32
+; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX8-RV64-NEXT: vse32.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <4 x i32>, <4 x i32>* %x
%b = load <4 x i32>, <4 x i32>* %y
%c = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
@@ -666,6 +824,127 @@ define void @ctlz_v2i64(<2 x i64>* %x, <2 x i64>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a1
; LMULMAX1-RV64-NEXT: vse64.v v8, (a0)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: ctlz_v2i64:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vle64.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 2
+; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 8
+; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 16
+; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: addi a1, zero, 32
+; LMULMAX8-RV32-NEXT: vsrl.vx v9, v8, a1
+; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.i v9, -1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX8-RV32-NEXT: lui a1, 349525
+; LMULMAX8-RV32-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vand.vv v9, v9, v10
+; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: lui a1, 209715
+; LMULMAX8-RV32-NEXT: addi a1, a1, 819
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v9
+; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: vadd.vv v8, v10, v8
+; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: lui a1, 61681
+; LMULMAX8-RV32-NEXT: addi a1, a1, -241
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: lui a1, 4112
+; LMULMAX8-RV32-NEXT: addi a1, a1, 257
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: addi a1, zero, 56
+; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV32-NEXT: vse64.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: ctlz_v2i64:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV64-NEXT: vle64.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9
+; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 2
+; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9
+; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9
+; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 8
+; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9
+; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 16
+; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9
+; LMULMAX8-RV64-NEXT: addi a1, zero, 32
+; LMULMAX8-RV64-NEXT: vsrl.vx v9, v8, a1
+; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9
+; LMULMAX8-RV64-NEXT: vxor.vi v8, v8, -1
+; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX8-RV64-NEXT: lui a1, 21845
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 1365
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV64-NEXT: vand.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v9
+; LMULMAX8-RV64-NEXT: lui a1, 13107
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 819
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 819
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 819
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 819
+; LMULMAX8-RV64-NEXT: vand.vx v9, v8, a1
+; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: vadd.vv v8, v9, v8
+; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v9
+; LMULMAX8-RV64-NEXT: lui a1, 3855
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 241
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, -241
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 241
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, -241
+; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: lui a1, 4112
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 257
+; LMULMAX8-RV64-NEXT: slli a1, a1, 16
+; LMULMAX8-RV64-NEXT: addi a1, a1, 257
+; LMULMAX8-RV64-NEXT: slli a1, a1, 16
+; LMULMAX8-RV64-NEXT: addi a1, a1, 257
+; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: addi a1, zero, 56
+; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: vse64.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <2 x i64>, <2 x i64>* %x
%b = load <2 x i64>, <2 x i64>* %y
%c = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
@@ -820,6 +1099,44 @@ define void @ctlz_v32i8(<32 x i8>* %x, <32 x i8>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vse8.v v9, (a0)
; LMULMAX1-RV64-NEXT: vse8.v v8, (a1)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: ctlz_v32i8:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: addi a1, zero, 32
+; LMULMAX8-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vle8.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; LMULMAX8-RV32-NEXT: vzext.vf4 v16, v8
+; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v16, v16
+; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; LMULMAX8-RV32-NEXT: vnsrl.wi v12, v16, 23
+; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 0
+; LMULMAX8-RV32-NEXT: addi a1, zero, 134
+; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV32-NEXT: vrsub.vx v8, v10, a1
+; LMULMAX8-RV32-NEXT: vmerge.vim v8, v8, 8, v0
+; LMULMAX8-RV32-NEXT: vse8.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: ctlz_v32i8:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: addi a1, zero, 32
+; LMULMAX8-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu
+; LMULMAX8-RV64-NEXT: vle8.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; LMULMAX8-RV64-NEXT: vzext.vf4 v16, v8
+; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v16, v16
+; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; LMULMAX8-RV64-NEXT: vnsrl.wi v12, v16, 23
+; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 0
+; LMULMAX8-RV64-NEXT: addi a1, zero, 134
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: vrsub.vx v8, v10, a1
+; LMULMAX8-RV64-NEXT: vmerge.vim v8, v8, 8, v0
+; LMULMAX8-RV64-NEXT: vse8.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <32 x i8>, <32 x i8>* %x
%b = load <32 x i8>, <32 x i8>* %y
%c = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
@@ -1016,6 +1333,34 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vse16.v v9, (a0)
; LMULMAX1-RV64-NEXT: vse16.v v8, (a1)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: ctlz_v16i16:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vle16.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v12, v8
+; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 23
+; LMULMAX8-RV32-NEXT: addi a1, zero, 142
+; LMULMAX8-RV32-NEXT: vrsub.vx v10, v10, a1
+; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV32-NEXT: addi a1, zero, 16
+; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX8-RV32-NEXT: vse16.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: ctlz_v16i16:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu
+; LMULMAX8-RV64-NEXT: vle16.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v12, v8
+; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 23
+; LMULMAX8-RV64-NEXT: addi a1, zero, 142
+; LMULMAX8-RV64-NEXT: vrsub.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: addi a1, zero, 16
+; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX8-RV64-NEXT: vse16.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %x
%b = load <16 x i16>, <16 x i16>* %y
%c = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
@@ -1228,6 +1573,36 @@ define void @ctlz_v8i32(<8 x i32>* %x, <8 x i32>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vse32.v v9, (a0)
; LMULMAX1-RV64-NEXT: vse32.v v8, (a1)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: ctlz_v8i32:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vle32.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v12, v8
+; LMULMAX8-RV32-NEXT: addi a1, zero, 52
+; LMULMAX8-RV32-NEXT: vnsrl.wx v10, v12, a1
+; LMULMAX8-RV32-NEXT: addi a1, zero, 1054
+; LMULMAX8-RV32-NEXT: vrsub.vx v10, v10, a1
+; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV32-NEXT: addi a1, zero, 32
+; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX8-RV32-NEXT: vse32.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: ctlz_v8i32:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV64-NEXT: vle32.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v12, v8
+; LMULMAX8-RV64-NEXT: addi a1, zero, 52
+; LMULMAX8-RV64-NEXT: vnsrl.wx v10, v12, a1
+; LMULMAX8-RV64-NEXT: addi a1, zero, 1054
+; LMULMAX8-RV64-NEXT: vrsub.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: addi a1, zero, 32
+; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX8-RV64-NEXT: vse32.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <8 x i32>, <8 x i32>* %x
%b = load <8 x i32>, <8 x i32>* %y
%c = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
@@ -1534,6 +1909,127 @@ define void @ctlz_v4i64(<4 x i64>* %x, <4 x i64>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vse64.v v9, (a0)
; LMULMAX1-RV64-NEXT: vse64.v v8, (a7)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: ctlz_v4i64:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vle64.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 2
+; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 8
+; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 16
+; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: addi a1, zero, 32
+; LMULMAX8-RV32-NEXT: vsrl.vx v10, v8, a1
+; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.i v10, -1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX8-RV32-NEXT: lui a1, 349525
+; LMULMAX8-RV32-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v12, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vand.vv v10, v10, v12
+; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: lui a1, 209715
+; LMULMAX8-RV32-NEXT: addi a1, a1, 819
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vand.vv v12, v8, v10
+; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: vadd.vv v8, v12, v8
+; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: lui a1, 61681
+; LMULMAX8-RV32-NEXT: addi a1, a1, -241
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: lui a1, 4112
+; LMULMAX8-RV32-NEXT: addi a1, a1, 257
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: addi a1, zero, 56
+; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV32-NEXT: vse64.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: ctlz_v4i64:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV64-NEXT: vle64.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10
+; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 2
+; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10
+; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10
+; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 8
+; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10
+; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 16
+; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10
+; LMULMAX8-RV64-NEXT: addi a1, zero, 32
+; LMULMAX8-RV64-NEXT: vsrl.vx v10, v8, a1
+; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10
+; LMULMAX8-RV64-NEXT: vxor.vi v8, v8, -1
+; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX8-RV64-NEXT: lui a1, 21845
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 1365
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV64-NEXT: vand.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v10
+; LMULMAX8-RV64-NEXT: lui a1, 13107
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 819
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 819
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 819
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 819
+; LMULMAX8-RV64-NEXT: vand.vx v10, v8, a1
+; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: vadd.vv v8, v10, v8
+; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v10
+; LMULMAX8-RV64-NEXT: lui a1, 3855
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 241
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, -241
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 241
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, -241
+; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: lui a1, 4112
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 257
+; LMULMAX8-RV64-NEXT: slli a1, a1, 16
+; LMULMAX8-RV64-NEXT: addi a1, a1, 257
+; LMULMAX8-RV64-NEXT: slli a1, a1, 16
+; LMULMAX8-RV64-NEXT: addi a1, a1, 257
+; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: addi a1, zero, 56
+; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: vse64.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <4 x i64>, <4 x i64>* %x
%b = load <4 x i64>, <4 x i64>* %y
%c = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
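The ctlz checks above all follow the same shape once +d is available: widen, convert to unsigned FP, pull the biased exponent out with a narrowing shift, and reverse-subtract it from a per-element-width constant (1054 for i32 via f64, 142 for i16 via f32, 134 for i8 via f32), with a vmseq.vi/vmerge pair patching up the zero input. As a minimal scalar sketch of that arithmetic, not part of the patch and with a purely illustrative helper name, the i32 case could be modeled in C as follows, assuming the unsigned-to-double conversion is exact (which it is for 32-bit values):

    #include <stdint.h>
    #include <string.h>

    /* Scalar model of the CTLZ lowering exercised above: the double's
       biased exponent is 1023 + floor(log2(x)), so ctlz(x) =
       31 - floor(log2(x)) = (31 + 1023) - exponent = 1054 - exponent. */
    static unsigned ctlz32_via_fp(uint32_t x) {
      if (x == 0)
        return 32;                    /* the vmseq.vi/vmerge.vxm special case */
      double d = (double)x;           /* vfwcvt.f.xu.v: exact for 32-bit inputs */
      uint64_t bits;
      memcpy(&bits, &d, sizeof(bits));
      unsigned exp = (unsigned)(bits >> 52) & 0x7ff; /* vnsrl.wx ..., 52 */
      return 1054 - exp;              /* vrsub.vx ..., 1054 */
    }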
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
index c235430fb3cbd..3fcf539bd58c9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -1,8 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32,LMULMAX2-RV32I
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64,LMULMAX2-RV64I
; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32,LMULMAX2-RV32D
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64,LMULMAX2-RV64D
+; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX8-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX8-RV64
define void @cttz_v16i8(<16 x i8>* %x, <16 x i8>* %y) nounwind {
; LMULMAX2-RV32-LABEL: cttz_v16i8:
@@ -96,6 +102,46 @@ define void @cttz_v16i8(<16 x i8>* %x, <16 x i8>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vand.vi v8, v8, 15
; LMULMAX1-RV64-NEXT: vse8.v v8, (a0)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: cttz_v16i8:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vle8.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX8-RV32-NEXT: vand.vv v9, v8, v9
+; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; LMULMAX8-RV32-NEXT: vzext.vf4 v12, v9
+; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v12, v12
+; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 23
+; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vnsrl.wi v9, v10, 0
+; LMULMAX8-RV32-NEXT: addi a1, zero, 127
+; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV32-NEXT: vsub.vx v8, v9, a1
+; LMULMAX8-RV32-NEXT: vmerge.vim v8, v8, 8, v0
+; LMULMAX8-RV32-NEXT: vse8.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: cttz_v16i8:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; LMULMAX8-RV64-NEXT: vle8.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX8-RV64-NEXT: vand.vv v9, v8, v9
+; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; LMULMAX8-RV64-NEXT: vzext.vf4 v12, v9
+; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v12, v12
+; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 23
+; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; LMULMAX8-RV64-NEXT: vnsrl.wi v9, v10, 0
+; LMULMAX8-RV64-NEXT: addi a1, zero, 127
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: vsub.vx v8, v9, a1
+; LMULMAX8-RV64-NEXT: vmerge.vim v8, v8, 8, v0
+; LMULMAX8-RV64-NEXT: vse8.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <16 x i8>, <16 x i8>* %x
%b = load <16 x i8>, <16 x i8>* %y
%c = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
@@ -105,65 +151,65 @@ define void @cttz_v16i8(<16 x i8>* %x, <16 x i8>* %y) nounwind {
declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1)
define void @cttz_v8i16(<8 x i16>* %x, <8 x i16>* %y) nounwind {
-; LMULMAX2-RV32-LABEL: cttz_v8i16:
-; LMULMAX2-RV32: # %bb.0:
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu
-; LMULMAX2-RV32-NEXT: vle16.v v8, (a0)
-; LMULMAX2-RV32-NEXT: addi a1, zero, 1
-; LMULMAX2-RV32-NEXT: vsub.vx v9, v8, a1
-; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV32-NEXT: lui a1, 5
-; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1
-; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 3
-; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 1
-; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: addi a1, zero, 257
-; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 8
-; LMULMAX2-RV32-NEXT: vse16.v v8, (a0)
-; LMULMAX2-RV32-NEXT: ret
+; LMULMAX2-RV32I-LABEL: cttz_v8i16:
+; LMULMAX2-RV32I: # %bb.0:
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; LMULMAX2-RV32I-NEXT: vle16.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: addi a1, zero, 1
+; LMULMAX2-RV32I-NEXT: vsub.vx v9, v8, a1
+; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1
+; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV32I-NEXT: lui a1, 5
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1
+; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 3
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 1
+; LMULMAX2-RV32I-NEXT: addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: addi a1, zero, 257
+; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 8
+; LMULMAX2-RV32I-NEXT: vse16.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: ret
;
-; LMULMAX2-RV64-LABEL: cttz_v8i16:
-; LMULMAX2-RV64: # %bb.0:
-; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu
-; LMULMAX2-RV64-NEXT: vle16.v v8, (a0)
-; LMULMAX2-RV64-NEXT: addi a1, zero, 1
-; LMULMAX2-RV64-NEXT: vsub.vx v9, v8, a1
-; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1
-; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV64-NEXT: lui a1, 5
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1
-; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: lui a1, 3
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: lui a1, 1
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: addi a1, zero, 257
-; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 8
-; LMULMAX2-RV64-NEXT: vse16.v v8, (a0)
-; LMULMAX2-RV64-NEXT: ret
+; LMULMAX2-RV64I-LABEL: cttz_v8i16:
+; LMULMAX2-RV64I: # %bb.0:
+; LMULMAX2-RV64I-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; LMULMAX2-RV64I-NEXT: vle16.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: addi a1, zero, 1
+; LMULMAX2-RV64I-NEXT: vsub.vx v9, v8, a1
+; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1
+; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV64I-NEXT: lui a1, 5
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365
+; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1
+; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: lui a1, 3
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819
+; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: lui a1, 1
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: addi a1, zero, 257
+; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 8
+; LMULMAX2-RV64I-NEXT: vse16.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: ret
;
; LMULMAX1-RV32-LABEL: cttz_v8i16:
; LMULMAX1-RV32: # %bb.0:
@@ -224,6 +270,70 @@ define void @cttz_v8i16(<8 x i16>* %x, <8 x i16>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 8
; LMULMAX1-RV64-NEXT: vse16.v v8, (a0)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX2-RV32D-LABEL: cttz_v8i16:
+; LMULMAX2-RV32D: # %bb.0:
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; LMULMAX2-RV32D-NEXT: vle16.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX2-RV32D-NEXT: vand.vv v9, v8, v9
+; LMULMAX2-RV32D-NEXT: vfwcvt.f.xu.v v10, v9
+; LMULMAX2-RV32D-NEXT: vnsrl.wi v9, v10, 23
+; LMULMAX2-RV32D-NEXT: addi a1, zero, 127
+; LMULMAX2-RV32D-NEXT: vsub.vx v9, v9, a1
+; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV32D-NEXT: addi a1, zero, 16
+; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32D-NEXT: vse16.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: ret
+;
+; LMULMAX2-RV64D-LABEL: cttz_v8i16:
+; LMULMAX2-RV64D: # %bb.0:
+; LMULMAX2-RV64D-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; LMULMAX2-RV64D-NEXT: vle16.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX2-RV64D-NEXT: vand.vv v9, v8, v9
+; LMULMAX2-RV64D-NEXT: vfwcvt.f.xu.v v10, v9
+; LMULMAX2-RV64D-NEXT: vnsrl.wi v9, v10, 23
+; LMULMAX2-RV64D-NEXT: addi a1, zero, 127
+; LMULMAX2-RV64D-NEXT: vsub.vx v9, v9, a1
+; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT: addi a1, zero, 16
+; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64D-NEXT: vse16.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: cttz_v8i16:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vle16.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX8-RV32-NEXT: vand.vv v9, v8, v9
+; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v10, v9
+; LMULMAX8-RV32-NEXT: vnsrl.wi v9, v10, 23
+; LMULMAX8-RV32-NEXT: addi a1, zero, 127
+; LMULMAX8-RV32-NEXT: vsub.vx v9, v9, a1
+; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV32-NEXT: addi a1, zero, 16
+; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX8-RV32-NEXT: vse16.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: cttz_v8i16:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; LMULMAX8-RV64-NEXT: vle16.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX8-RV64-NEXT: vand.vv v9, v8, v9
+; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v10, v9
+; LMULMAX8-RV64-NEXT: vnsrl.wi v9, v10, 23
+; LMULMAX8-RV64-NEXT: addi a1, zero, 127
+; LMULMAX8-RV64-NEXT: vsub.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: addi a1, zero, 16
+; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX8-RV64-NEXT: vse16.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <8 x i16>, <8 x i16>* %x
%b = load <8 x i16>, <8 x i16>* %y
%c = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
@@ -233,67 +343,67 @@ define void @cttz_v8i16(<8 x i16>* %x, <8 x i16>* %y) nounwind {
declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1)
define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) nounwind {
-; LMULMAX2-RV32-LABEL: cttz_v4i32:
-; LMULMAX2-RV32: # %bb.0:
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-RV32-NEXT: vle32.v v8, (a0)
-; LMULMAX2-RV32-NEXT: addi a1, zero, 1
-; LMULMAX2-RV32-NEXT: vsub.vx v9, v8, a1
-; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV32-NEXT: lui a1, 349525
-; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1
-; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 209715
-; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 61681
-; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: lui a1, 4112
-; LMULMAX2-RV32-NEXT: addi a1, a1, 257
-; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX2-RV32-NEXT: vse32.v v8, (a0)
-; LMULMAX2-RV32-NEXT: ret
+; LMULMAX2-RV32I-LABEL: cttz_v4i32:
+; LMULMAX2-RV32I: # %bb.0:
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX2-RV32I-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: addi a1, zero, 1
+; LMULMAX2-RV32I-NEXT: vsub.vx v9, v8, a1
+; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1
+; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV32I-NEXT: lui a1, 349525
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1
+; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 209715
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 61681
+; LMULMAX2-RV32I-NEXT: addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: lui a1, 4112
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 24
+; LMULMAX2-RV32I-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: ret
;
-; LMULMAX2-RV64-LABEL: cttz_v4i32:
-; LMULMAX2-RV64: # %bb.0:
-; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-RV64-NEXT: vle32.v v8, (a0)
-; LMULMAX2-RV64-NEXT: addi a1, zero, 1
-; LMULMAX2-RV64-NEXT: vsub.vx v9, v8, a1
-; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1
-; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV64-NEXT: lui a1, 349525
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1
-; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: lui a1, 209715
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: lui a1, 61681
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: lui a1, 4112
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 257
-; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX2-RV64-NEXT: vse32.v v8, (a0)
-; LMULMAX2-RV64-NEXT: ret
+; LMULMAX2-RV64I-LABEL: cttz_v4i32:
+; LMULMAX2-RV64I: # %bb.0:
+; LMULMAX2-RV64I-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX2-RV64I-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: addi a1, zero, 1
+; LMULMAX2-RV64I-NEXT: vsub.vx v9, v8, a1
+; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1
+; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV64I-NEXT: lui a1, 349525
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365
+; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1
+; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: lui a1, 209715
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819
+; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: lui a1, 61681
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: lui a1, 4112
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 257
+; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 24
+; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: ret
;
; LMULMAX1-RV32-LABEL: cttz_v4i32:
; LMULMAX1-RV32: # %bb.0:
@@ -356,6 +466,74 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24
; LMULMAX1-RV64-NEXT: vse32.v v8, (a0)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX2-RV32D-LABEL: cttz_v4i32:
+; LMULMAX2-RV32D: # %bb.0:
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX2-RV32D-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX2-RV32D-NEXT: vand.vv v9, v8, v9
+; LMULMAX2-RV32D-NEXT: vfwcvt.f.xu.v v10, v9
+; LMULMAX2-RV32D-NEXT: addi a1, zero, 52
+; LMULMAX2-RV32D-NEXT: vnsrl.wx v9, v10, a1
+; LMULMAX2-RV32D-NEXT: addi a1, zero, 1023
+; LMULMAX2-RV32D-NEXT: vsub.vx v9, v9, a1
+; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV32D-NEXT: addi a1, zero, 32
+; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32D-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: ret
+;
+; LMULMAX2-RV64D-LABEL: cttz_v4i32:
+; LMULMAX2-RV64D: # %bb.0:
+; LMULMAX2-RV64D-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX2-RV64D-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX2-RV64D-NEXT: vand.vv v9, v8, v9
+; LMULMAX2-RV64D-NEXT: vfwcvt.f.xu.v v10, v9
+; LMULMAX2-RV64D-NEXT: addi a1, zero, 52
+; LMULMAX2-RV64D-NEXT: vnsrl.wx v9, v10, a1
+; LMULMAX2-RV64D-NEXT: addi a1, zero, 1023
+; LMULMAX2-RV64D-NEXT: vsub.vx v9, v9, a1
+; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT: addi a1, zero, 32
+; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64D-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: cttz_v4i32:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vle32.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX8-RV32-NEXT: vand.vv v9, v8, v9
+; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v10, v9
+; LMULMAX8-RV32-NEXT: addi a1, zero, 52
+; LMULMAX8-RV32-NEXT: vnsrl.wx v9, v10, a1
+; LMULMAX8-RV32-NEXT: addi a1, zero, 1023
+; LMULMAX8-RV32-NEXT: vsub.vx v9, v9, a1
+; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV32-NEXT: addi a1, zero, 32
+; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX8-RV32-NEXT: vse32.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: cttz_v4i32:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV64-NEXT: vle32.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX8-RV64-NEXT: vand.vv v9, v8, v9
+; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v10, v9
+; LMULMAX8-RV64-NEXT: addi a1, zero, 52
+; LMULMAX8-RV64-NEXT: vnsrl.wx v9, v10, a1
+; LMULMAX8-RV64-NEXT: addi a1, zero, 1023
+; LMULMAX8-RV64-NEXT: vsub.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: addi a1, zero, 32
+; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX8-RV64-NEXT: vse32.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <4 x i32>, <4 x i32>* %x
%b = load <4 x i32>, <4 x i32>* %y
%c = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
@@ -566,6 +744,107 @@ define void @cttz_v2i64(<2 x i64>* %x, <2 x i64>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a1
; LMULMAX1-RV64-NEXT: vse64.v v8, (a0)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: cttz_v2i64:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vle64.v v8, (a0)
+; LMULMAX8-RV32-NEXT: addi a1, zero, 1
+; LMULMAX8-RV32-NEXT: vsub.vx v9, v8, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.i v10, -1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX8-RV32-NEXT: lui a1, 349525
+; LMULMAX8-RV32-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vand.vv v9, v9, v10
+; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: lui a1, 209715
+; LMULMAX8-RV32-NEXT: addi a1, a1, 819
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v9
+; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: vadd.vv v8, v10, v8
+; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: lui a1, 61681
+; LMULMAX8-RV32-NEXT: addi a1, a1, -241
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: lui a1, 4112
+; LMULMAX8-RV32-NEXT: addi a1, a1, 257
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v9
+; LMULMAX8-RV32-NEXT: addi a1, zero, 56
+; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV32-NEXT: vse64.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: cttz_v2i64:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; LMULMAX8-RV64-NEXT: vle64.v v8, (a0)
+; LMULMAX8-RV64-NEXT: addi a1, zero, 1
+; LMULMAX8-RV64-NEXT: vsub.vx v9, v8, a1
+; LMULMAX8-RV64-NEXT: vxor.vi v8, v8, -1
+; LMULMAX8-RV64-NEXT: vand.vv v8, v8, v9
+; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX8-RV64-NEXT: lui a1, 21845
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 1365
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV64-NEXT: vand.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v9
+; LMULMAX8-RV64-NEXT: lui a1, 13107
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 819
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 819
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 819
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 819
+; LMULMAX8-RV64-NEXT: vand.vx v9, v8, a1
+; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: vadd.vv v8, v9, v8
+; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v9
+; LMULMAX8-RV64-NEXT: lui a1, 3855
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 241
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, -241
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 241
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, -241
+; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: lui a1, 4112
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 257
+; LMULMAX8-RV64-NEXT: slli a1, a1, 16
+; LMULMAX8-RV64-NEXT: addi a1, a1, 257
+; LMULMAX8-RV64-NEXT: slli a1, a1, 16
+; LMULMAX8-RV64-NEXT: addi a1, a1, 257
+; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: addi a1, zero, 56
+; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: vse64.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <2 x i64>, <2 x i64>* %x
%b = load <2 x i64>, <2 x i64>* %y
%c = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
@@ -700,6 +979,48 @@ define void @cttz_v32i8(<32 x i8>* %x, <32 x i8>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vse8.v v9, (a0)
; LMULMAX1-RV64-NEXT: vse8.v v8, (a1)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: cttz_v32i8:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: addi a1, zero, 32
+; LMULMAX8-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vle8.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v10
+; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; LMULMAX8-RV32-NEXT: vzext.vf4 v16, v10
+; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v16, v16
+; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; LMULMAX8-RV32-NEXT: vnsrl.wi v12, v16, 23
+; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 0
+; LMULMAX8-RV32-NEXT: addi a1, zero, 127
+; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV32-NEXT: vsub.vx v8, v10, a1
+; LMULMAX8-RV32-NEXT: vmerge.vim v8, v8, 8, v0
+; LMULMAX8-RV32-NEXT: vse8.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: cttz_v32i8:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: addi a1, zero, 32
+; LMULMAX8-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu
+; LMULMAX8-RV64-NEXT: vle8.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX8-RV64-NEXT: vand.vv v10, v8, v10
+; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; LMULMAX8-RV64-NEXT: vzext.vf4 v16, v10
+; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v16, v16
+; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; LMULMAX8-RV64-NEXT: vnsrl.wi v12, v16, 23
+; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 0
+; LMULMAX8-RV64-NEXT: addi a1, zero, 127
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: vsub.vx v8, v10, a1
+; LMULMAX8-RV64-NEXT: vmerge.vim v8, v8, 8, v0
+; LMULMAX8-RV64-NEXT: vse8.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <32 x i8>, <32 x i8>* %x
%b = load <32 x i8>, <32 x i8>* %y
%c = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 false)
@@ -864,6 +1185,38 @@ define void @cttz_v16i16(<16 x i16>* %x, <16 x i16>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vse16.v v9, (a0)
; LMULMAX1-RV64-NEXT: vse16.v v8, (a1)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: cttz_v16i16:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vle16.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v10
+; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v12, v10
+; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 23
+; LMULMAX8-RV32-NEXT: addi a1, zero, 127
+; LMULMAX8-RV32-NEXT: vsub.vx v10, v10, a1
+; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV32-NEXT: addi a1, zero, 16
+; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX8-RV32-NEXT: vse16.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: cttz_v16i16:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu
+; LMULMAX8-RV64-NEXT: vle16.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX8-RV64-NEXT: vand.vv v10, v8, v10
+; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v12, v10
+; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 23
+; LMULMAX8-RV64-NEXT: addi a1, zero, 127
+; LMULMAX8-RV64-NEXT: vsub.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: addi a1, zero, 16
+; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX8-RV64-NEXT: vse16.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %x
%b = load <16 x i16>, <16 x i16>* %y
%c = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 false)
@@ -1032,6 +1385,40 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vse32.v v9, (a0)
; LMULMAX1-RV64-NEXT: vse32.v v8, (a1)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: cttz_v8i32:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vle32.v v8, (a0)
+; LMULMAX8-RV32-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v10
+; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v12, v10
+; LMULMAX8-RV32-NEXT: addi a1, zero, 52
+; LMULMAX8-RV32-NEXT: vnsrl.wx v10, v12, a1
+; LMULMAX8-RV32-NEXT: addi a1, zero, 1023
+; LMULMAX8-RV32-NEXT: vsub.vx v10, v10, a1
+; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV32-NEXT: addi a1, zero, 32
+; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX8-RV32-NEXT: vse32.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: cttz_v8i32:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV64-NEXT: vle32.v v8, (a0)
+; LMULMAX8-RV64-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX8-RV64-NEXT: vand.vv v10, v8, v10
+; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v12, v10
+; LMULMAX8-RV64-NEXT: addi a1, zero, 52
+; LMULMAX8-RV64-NEXT: vnsrl.wx v10, v12, a1
+; LMULMAX8-RV64-NEXT: addi a1, zero, 1023
+; LMULMAX8-RV64-NEXT: vsub.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: addi a1, zero, 32
+; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX8-RV64-NEXT: vse32.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <8 x i32>, <8 x i32>* %x
%b = load <8 x i32>, <8 x i32>* %y
%c = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 false)
@@ -1278,6 +1665,107 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) nounwind {
; LMULMAX1-RV64-NEXT: vse64.v v9, (a0)
; LMULMAX1-RV64-NEXT: vse64.v v8, (a7)
; LMULMAX1-RV64-NEXT: ret
+;
+; LMULMAX8-RV32-LABEL: cttz_v4i64:
+; LMULMAX8-RV32: # %bb.0:
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vle64.v v8, (a0)
+; LMULMAX8-RV32-NEXT: addi a1, zero, 1
+; LMULMAX8-RV32-NEXT: vsub.vx v10, v8, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.i v12, -1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v12
+; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX8-RV32-NEXT: lui a1, 349525
+; LMULMAX8-RV32-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v12, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vand.vv v10, v10, v12
+; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: lui a1, 209715
+; LMULMAX8-RV32-NEXT: addi a1, a1, 819
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vand.vv v12, v8, v10
+; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: vadd.vv v8, v12, v8
+; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: lui a1, 61681
+; LMULMAX8-RV32-NEXT: addi a1, a1, -241
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: lui a1, 4112
+; LMULMAX8-RV32-NEXT: addi a1, a1, 257
+; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v10
+; LMULMAX8-RV32-NEXT: addi a1, zero, 56
+; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV32-NEXT: vse64.v v8, (a0)
+; LMULMAX8-RV32-NEXT: ret
+;
+; LMULMAX8-RV64-LABEL: cttz_v4i64:
+; LMULMAX8-RV64: # %bb.0:
+; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; LMULMAX8-RV64-NEXT: vle64.v v8, (a0)
+; LMULMAX8-RV64-NEXT: addi a1, zero, 1
+; LMULMAX8-RV64-NEXT: vsub.vx v10, v8, a1
+; LMULMAX8-RV64-NEXT: vxor.vi v8, v8, -1
+; LMULMAX8-RV64-NEXT: vand.vv v8, v8, v10
+; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX8-RV64-NEXT: lui a1, 21845
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 1365
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV64-NEXT: vand.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v10
+; LMULMAX8-RV64-NEXT: lui a1, 13107
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 819
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 819
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 819
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 819
+; LMULMAX8-RV64-NEXT: vand.vx v10, v8, a1
+; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: vadd.vv v8, v10, v8
+; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v10
+; LMULMAX8-RV64-NEXT: lui a1, 3855
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 241
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, -241
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, 241
+; LMULMAX8-RV64-NEXT: slli a1, a1, 12
+; LMULMAX8-RV64-NEXT: addi a1, a1, -241
+; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: lui a1, 4112
+; LMULMAX8-RV64-NEXT: addiw a1, a1, 257
+; LMULMAX8-RV64-NEXT: slli a1, a1, 16
+; LMULMAX8-RV64-NEXT: addi a1, a1, 257
+; LMULMAX8-RV64-NEXT: slli a1, a1, 16
+; LMULMAX8-RV64-NEXT: addi a1, a1, 257
+; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: addi a1, zero, 56
+; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: vse64.v v8, (a0)
+; LMULMAX8-RV64-NEXT: ret
%a = load <4 x i64>, <4 x i64>* %x
%b = load <4 x i64>, <4 x i64>* %y
%c = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 false)
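The cttz checks take the same exponent path after first isolating the lowest set bit with the vrsub.vi/vand.vv pair (x & -x), so the result is just the unbiased exponent: the constants 127 and 1023 in the checks are the f32 and f64 exponent biases. A hedged scalar sketch of the i32 case, again in C with an illustrative helper name and relying on the fact that an isolated power of two converts to double exactly:

    #include <stdint.h>
    #include <string.h>

    /* Scalar model of the CTTZ lowering above: x & -x is a power of two
       whose double exponent is 1023 + cttz(x), so removing the bias
       recovers the count. Zero again goes through the vmerge path. */
    static unsigned cttz32_via_fp(uint32_t x) {
      if (x == 0)
        return 32;                     /* vmseq.vi/vmerge.vxm fallback */
      uint32_t lsb = x & (0u - x);     /* vrsub.vi v9, v8, 0 ; vand.vv */
      double d = (double)lsb;          /* vfwcvt.f.xu.v */
      uint64_t bits;
      memcpy(&bits, &d, sizeof(bits));
      unsigned exp = (unsigned)(bits >> 52) & 0x7ff; /* vnsrl.wx ..., 52 */
      return exp - 1023;               /* vsub.vx ..., 1023 */
    }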