[llvm] 3b786f2 - [AArch64] Add intrinsic to count trailing zero elements
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 31 03:48:25 PDT 2023
Author: Kerry McLaughlin
Date: 2023-10-31T10:48:08Z
New Revision: 3b786f2c7608964b4481b9fcd24ab29c7c42243d
URL: https://github.com/llvm/llvm-project/commit/3b786f2c7608964b4481b9fcd24ab29c7c42243d
DIFF: https://github.com/llvm/llvm-project/commit/3b786f2c7608964b4481b9fcd24ab29c7c42243d.diff
LOG: [AArch64] Add intrinsic to count trailing zero elements
This patch introduces an experimental intrinsic for counting the
trailing zero elements in a vector. The intrinsic has a generic expansion
in SelectionDAGBuilder, and for AArch64 there is a pattern that matches
it to the brkb & cntp instructions when SVE is enabled.
The intrinsic has a second operand, is_zero_poison, similar to the
existing cttz intrinsic.
These changes have been split out from D158291.
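As a usage illustration (not part of the patch; the function name is made up,
but the declaration matches the one documented in LangRef below), a call to
the new intrinsic on a fixed-width i1 mask looks like:

define i8 @count_trailing_zero_lanes(<8 x i1> %mask) {
  %n = call i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1> %mask, i1 false)
  ret i8 %n
}

declare i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1>, i1)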
Added:
llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll
llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll
Modified:
llvm/docs/LangRef.rst
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/include/llvm/IR/Intrinsics.td
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
Removed:
################################################################################
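The generic expansion added to SelectionDAGBuilder below is, roughly, the
following IR for a fixed-width <4 x i32> input (an illustrative sketch only,
with made-up names; the real code builds SDNodes and also handles scalable
types and the zero-is-poison flag):

define i32 @cttz_elts_expansion_sketch(<4 x i32> %src) {
  ; Non-zero lanes become all-ones, zero lanes stay zero.
  %nonzero = icmp ne <4 x i32> %src, zeroinitializer
  %ext = sext <4 x i1> %nonzero to <4 x i32>
  ; Mask the descending sequence <VL, VL-1, ..., 1> by the non-zero lanes.
  %masked = and <4 x i32> <i32 4, i32 3, i32 2, i32 1>, %ext
  ; The largest surviving value marks the first non-zero lane.
  %max = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %masked)
  ; VL - max is the trailing zero element count (VL when all lanes are zero).
  %res = sub i32 4, %max
  ret i32 %res
}

declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)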
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 803503a0e8cc7a1..d8679829fcd5931 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18497,6 +18497,45 @@ Arguments:
Both arguments must be vectors of the same type whereby their logical
concatenation matches the result type.
+'``llvm.experimental.cttz.elts``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.experimental.cttz.elts``
+on any vector of integer elements, both fixed width and scalable.
+
+::
+
+ declare i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1> <src>, i1 <is_zero_poison>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.cttz.elts``' intrinsic counts the number of trailing
+zero elements of a vector.
+
+Arguments:
+""""""""""
+
+The first argument is the vector to be counted. This argument must be a vector
+with integer element type. The return type must also be an integer type which is
+wide enough to hold the maximum number of elements of the source vector. The
+behaviour of this intrinsic is undefined if the return type is not wide enough
+for the number of elements in the input vector.
+
+The second argument is a constant flag that indicates whether the intrinsic
+returns a valid result if the first argument is all zero. If the first argument
+is all zero and the second argument is true, the result is poison.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.cttz.elts``' intrinsic counts the trailing (least
+significant) zero elements in a vector. If ``src == 0`` the result is the
+number of elements in the input vector.
+
'``llvm.experimental.vector.splice``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 1494f335e4936fe..c87537291e3b161 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -465,6 +465,10 @@ class TargetLoweringBase {
return true;
}
+ /// Return true if the @llvm.experimental.cttz.elts intrinsic should be
+ /// expanded using generic code in SelectionDAGBuilder.
+ virtual bool shouldExpandCttzElements(EVT VT) const { return true; }
+
// Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to
// vecreduce(op(x, y)) for the reduction opcode RedOpc.
virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const {
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index d7a81591d03443d..737fa2a41a3e2c8 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2182,6 +2182,11 @@ def int_experimental_get_vector_length:
[IntrNoMem, IntrNoSync, IntrWillReturn,
ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+def int_experimental_cttz_elts:
+ DefaultAttrsIntrinsic<[llvm_anyint_ty],
+ [llvm_anyvector_ty, llvm_i1_ty],
+ [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+
def int_experimental_vp_splice:
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c518b1f95e9023e..229f220d8460bda 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7514,6 +7514,62 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
setValue(&I, Trunc);
return;
}
+ case Intrinsic::experimental_cttz_elts: {
+ auto DL = getCurSDLoc();
+ SDValue Op = getValue(I.getOperand(0));
+ EVT OpVT = Op.getValueType();
+
+ if (!TLI.shouldExpandCttzElements(OpVT)) {
+ visitTargetIntrinsic(I, Intrinsic);
+ return;
+ }
+
+ if (OpVT.getScalarType() != MVT::i1) {
+ // Compare the input vector elements to zero and use the result to count the trailing zeros
+ SDValue AllZero = DAG.getConstant(0, DL, OpVT);
+ OpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ OpVT.getVectorElementCount());
+ Op = DAG.getSetCC(DL, OpVT, Op, AllZero, ISD::SETNE);
+ }
+
+ // Find the smallest "sensible" element type to use for the expansion.
+ ConstantRange CR(
+ APInt(64, OpVT.getVectorElementCount().getKnownMinValue()));
+ if (OpVT.isScalableVT())
+ CR = CR.umul_sat(getVScaleRange(I.getCaller(), 64));
+
+ // If the zero-is-poison flag is set, we can assume the upper limit
+ // of the result is VF-1.
+ if (!cast<ConstantSDNode>(getValue(I.getOperand(1)))->isZero())
+ CR = CR.subtract(APInt(64, 1));
+
+ unsigned EltWidth = I.getType()->getScalarSizeInBits();
+ EltWidth = std::min(EltWidth, (unsigned)CR.getActiveBits());
+ EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8);
+
+ MVT NewEltTy = MVT::getIntegerVT(EltWidth);
+
+ // Create the new vector type & get the vector length
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltTy,
+ OpVT.getVectorElementCount());
+
+ SDValue VL =
+ DAG.getElementCount(DL, NewEltTy, OpVT.getVectorElementCount());
+
+ SDValue StepVec = DAG.getStepVector(DL, NewVT);
+ SDValue SplatVL = DAG.getSplat(NewVT, DL, VL);
+ SDValue StepVL = DAG.getNode(ISD::SUB, DL, NewVT, SplatVL, StepVec);
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, Op);
+ SDValue And = DAG.getNode(ISD::AND, DL, NewVT, StepVL, Ext);
+ SDValue Max = DAG.getNode(ISD::VECREDUCE_UMAX, DL, NewEltTy, And);
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, NewEltTy, VL, Max);
+
+ EVT RetTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ SDValue Ret = DAG.getZExtOrTrunc(Sub, DL, RetTy);
+
+ setValue(&I, Ret);
+ return;
+ }
case Intrinsic::vector_insert: {
SDValue Vec = getValue(I.getOperand(0));
SDValue SubVec = getValue(I.getOperand(1));
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4daaf21d42b3ce4..d00db82c9e49ac2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1791,6 +1791,10 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
return false;
}
+bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
+ return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
+}
+
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT,
bool StreamingSVE) {
assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
@@ -2634,6 +2638,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::MRRS)
MAKE_CASE(AArch64ISD::MSRR)
MAKE_CASE(AArch64ISD::RSHRNB_I)
+ MAKE_CASE(AArch64ISD::CTTZ_ELTS)
}
#undef MAKE_CASE
return nullptr;
@@ -5338,6 +5343,12 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
return SDValue();
}
+ case Intrinsic::experimental_cttz_elts: {
+ SDValue NewCttzElts =
+ DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
+
+ return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
+ }
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 52e519cd8a0c93c..7332a95615a4da5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -335,6 +335,8 @@ enum NodeType : unsigned {
PTEST_ANY,
PTRUE,
+ CTTZ_ELTS,
+
BITREVERSE_MERGE_PASSTHRU,
BSWAP_MERGE_PASSTHRU,
REVH_MERGE_PASSTHRU,
@@ -927,6 +929,8 @@ class AArch64TargetLowering : public TargetLowering {
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
+ bool shouldExpandCttzElements(EVT VT) const override;
+
/// If a change in streaming mode is required on entry to/return from a
/// function call it emits and returns the corresponding SMSTART or SMSTOP node.
/// \p Entry tells whether this is before/after the Call, which is necessary
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 069a283dd311e50..d262c8bbe485ac7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -842,6 +842,9 @@ def AArch64rshrnb_pf : PatFrags<(ops node:$rs, node:$i),
[(AArch64rshrnb node:$rs, node:$i),
(int_aarch64_sve_rshrnb node:$rs, node:$i)]>;
+def AArch64CttzElts : SDNode<"AArch64ISD::CTTZ_ELTS", SDTypeProfile<1, 1,
+ [SDTCisInt<0>, SDTCisVec<1>]>, []>;
+
// Match add node and also treat an 'or' node is as an 'add' if the or'ed operands
// have no common bits.
def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs),
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 1a586765d58b3ca..3faa06f995e5beb 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1964,6 +1964,11 @@ let Predicates = [HasSVEorSME] in {
defm CNTW_XPiI : sve_int_count<0b100, "cntw", int_aarch64_sve_cntw>;
defm CNTD_XPiI : sve_int_count<0b110, "cntd", int_aarch64_sve_cntd>;
defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>;
+
+ def : Pat<(i64 (AArch64CttzElts nxv16i1:$Op1)),
+ (i64 (!cast<Instruction>(CNTP_XPP_B)
+ (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1)),
+ (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1))))>;
}
defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb", add, int_aarch64_sve_cntb>;
@@ -2049,6 +2054,17 @@ let Predicates = [HasSVEorSME] in {
defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
defm DECP_ZP : sve_int_count_v<0b10100, "decp">;
+ def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv16i1:$Op2)))),
+ (i64 (!cast<Instruction>(INCP_XP_B)
+ (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op2)),
+ GPR64:$Op1))>;
+
+ def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv16i1:$Op2))))),
+ (i32 (EXTRACT_SUBREG (i64 (!cast<Instruction>(INCP_XP_B)
+ (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op2)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Op1, sub_32))),
+ sub_32))>;
+
defm INDEX_RR : sve_int_index_rr<"index", AArch64mul_p_oneuse>;
defm INDEX_IR : sve_int_index_ir<"index", AArch64mul_p, AArch64mul_p_oneuse>;
defm INDEX_RI : sve_int_index_ri<"index">;
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
new file mode 100644
index 000000000000000..1a4ab6ab334a64e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -0,0 +1,265 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s
+
+; WITH VSCALE RANGE
+
+define i64 @ctz_nxv8i1(<vscale x 8 x i1> %a) #0 {
+; CHECK-LABEL: ctz_nxv8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z0.h, #0, #-1
+; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: cnth x9
+; CHECK-NEXT: inch z0.h
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: and z0.h, z0.h, #0xff
+; CHECK-NEXT: umaxv h0, p0, z0.h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w8, w9, w8
+; CHECK-NEXT: and x0, x8, #0xff
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i1 0)
+ ret i64 %res
+}
+
+define i32 @ctz_nxv32i1(<vscale x 32 x i1> %a) #0 {
+; CHECK-LABEL: ctz_nxv32i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z0.h, #0, #-1
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: neg x8, x8
+; CHECK-NEXT: punpklo p3.h, p1.b
+; CHECK-NEXT: rdvl x9, #2
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: mov z1.h, w8
+; CHECK-NEXT: rdvl x8, #-1
+; CHECK-NEXT: punpkhi p1.h, p1.b
+; CHECK-NEXT: mov z2.h, w8
+; CHECK-NEXT: inch z0.h, all, mul #4
+; CHECK-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ptrue p2.h
+; CHECK-NEXT: mov z5.h, p3/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: add z1.h, z0.h, z1.h
+; CHECK-NEXT: add z4.h, z0.h, z2.h
+; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov z7.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: and z0.d, z0.d, z3.d
+; CHECK-NEXT: add z2.h, z1.h, z2.h
+; CHECK-NEXT: and z3.d, z4.d, z5.d
+; CHECK-NEXT: and z1.d, z1.d, z6.d
+; CHECK-NEXT: and z2.d, z2.d, z7.d
+; CHECK-NEXT: umax z0.h, p2/m, z0.h, z3.h
+; CHECK-NEXT: umax z1.h, p2/m, z1.h, z2.h
+; CHECK-NEXT: umax z0.h, p2/m, z0.h, z1.h
+; CHECK-NEXT: umaxv h0, p2, z0.h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w8, w9, w8
+; CHECK-NEXT: and w0, w8, #0xffff
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: ctz_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: index z1.s, #0, #-1
+; CHECK-NEXT: cntw x9
+; CHECK-NEXT: incw z1.s
+; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0
+; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: and z0.d, z1.d, z0.d
+; CHECK-NEXT: and z0.s, z0.s, #0xff
+; CHECK-NEXT: umaxv s0, p0, z0.s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w8, w9, w8
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i32(<vscale x 4 x i32> %a, i1 0)
+ ret i32 %res
+}
+
+; VSCALE RANGE, ZERO IS POISON
+
+define i64 @vscale_4096(<vscale x 16 x i8> %a) #1 {
+; CHECK-LABEL: vscale_4096:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: cnth x9
+; CHECK-NEXT: neg x8, x8
+; CHECK-NEXT: mov z1.s, w8
+; CHECK-NEXT: neg x8, x9
+; CHECK-NEXT: rdvl x9, #1
+; CHECK-NEXT: mov z2.s, w8
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: index z0.s, #0, #-1
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: incw z0.s, all, mul #4
+; CHECK-NEXT: add z1.s, z0.s, z1.s
+; CHECK-NEXT: add z5.s, z0.s, z2.s
+; CHECK-NEXT: punpkhi p2.h, p1.b
+; CHECK-NEXT: punpkhi p3.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: add z2.s, z1.s, z2.s
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ptrue p2.s
+; CHECK-NEXT: mov z4.s, p3/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov z6.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov z7.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: and z1.d, z1.d, z3.d
+; CHECK-NEXT: and z2.d, z2.d, z4.d
+; CHECK-NEXT: and z3.d, z5.d, z6.d
+; CHECK-NEXT: and z0.d, z0.d, z7.d
+; CHECK-NEXT: umax z1.s, p2/m, z1.s, z2.s
+; CHECK-NEXT: umax z0.s, p2/m, z0.s, z3.s
+; CHECK-NEXT: umax z0.s, p2/m, z0.s, z1.s
+; CHECK-NEXT: umaxv s0, p2, z0.s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w0, w9, w8
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i8(<vscale x 16 x i8> %a, i1 0)
+ ret i64 %res
+}
+
+define i64 @vscale_4096_poison(<vscale x 16 x i8> %a) #1 {
+; CHECK-LABEL: vscale_4096_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: rdvl x9, #1
+; CHECK-NEXT: neg x8, x8
+; CHECK-NEXT: mov z1.h, w8
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: index z0.h, #0, #-1
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: inch z0.h, all, mul #2
+; CHECK-NEXT: add z1.h, z0.h, z1.h
+; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: and z1.d, z1.d, z2.d
+; CHECK-NEXT: and z0.d, z0.d, z3.d
+; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: umaxv h0, p0, z0.h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w8, w9, w8
+; CHECK-NEXT: and x0, x8, #0xffff
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i8(<vscale x 16 x i8> %a, i1 1)
+ ret i64 %res
+}
+
+; NO VSCALE RANGE
+
+define i32 @ctz_nxv8i1_no_range(<vscale x 8 x i1> %a) {
+; CHECK-LABEL: ctz_nxv8i1_no_range:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z0.s, #0, #-1
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: neg x8, x8
+; CHECK-NEXT: cnth x9
+; CHECK-NEXT: mov z1.s, w8
+; CHECK-NEXT: incw z0.s, all, mul #2
+; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add z1.s, z0.s, z1.s
+; CHECK-NEXT: and z0.d, z0.d, z2.d
+; CHECK-NEXT: and z1.d, z1.d, z3.d
+; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: umaxv s0, p0, z0.s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w0, w9, w8
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+; MATCH WITH BRKB + CNTP
+
+define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
+; CHECK-LABEL: ctz_nxv16i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
+; CHECK-LABEL: ctz_nxv16i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i32 @ctz_and_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: ctz_and_nxv16i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %cmp = icmp ne <vscale x 16 x i8> %a, %b
+ %select = select <vscale x 16 x i1> %pg, <vscale x 16 x i1> %cmp, <vscale x 16 x i1> zeroinitializer
+ %and = and <vscale x 16 x i1> %pg, %select
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %and, i1 0)
+ ret i32 %res
+}
+
+define i64 @add_i64_ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a, i64 %b) {
+; CHECK-LABEL: add_i64_ctz_nxv16i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: incp x0, p0.b
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %a, i1 1)
+ %add = add i64 %res, %b
+ ret i64 %add
+}
+
+define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a, i32 %b) {
+; CHECK-LABEL: add_i32_ctz_nxv16i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: incp x0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %a, i1 1)
+ %trunc = trunc i64 %res to i32
+ %add = add i32 %trunc, %b
+ ret i32 %add
+}
+
+declare i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv4i32(<vscale x 4 x i32>, i1)
+
+declare i64 @llvm.experimental.cttz.elts.i64.nxv16i8(<vscale x 16 x i8>, i1)
+
+attributes #0 = { vscale_range(1,16) }
+attributes #1 = { vscale_range(1,4096) }
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll
new file mode 100644
index 000000000000000..a7ffefdecb5f7b7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll
@@ -0,0 +1,146 @@
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+; FIXED WIDTH
+
+define i8 @ctz_v8i1(<8 x i1> %a) {
+; CHECK-LABEL: .LCPI0_0:
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 1
+; CHECK-LABEL: ctz_v8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: mov w9, #8 // =0x8
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: umaxv b0, v0.8b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w0, w9, w8
+; CHECK-NEXT: ret
+ %res = call i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1> %a, i1 0)
+ ret i8 %res
+}
+
+define i32 @ctz_v16i1(<16 x i1> %a) {
+; CHECK-LABEL: .LCPI1_0:
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 12
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 1
+; CHECK-LABEL: ctz_v16i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: mov w9, #16 // =0x10
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: umaxv b0, v0.16b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w8, w9, w8
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i16 @ctz_v4i32(<4 x i32> %a) {
+; CHECK-LABEL: .LCPI2_0:
+; CHECK-NEXT: .hword 4
+; CHECK-NEXT: .hword 3
+; CHECK-NEXT: .hword 2
+; CHECK-NEXT: .hword 1
+; CHECK-LABEL: ctz_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: adrp x8, .LCPI2_0
+; CHECK-NEXT: mov w9, #4 // =0x4
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: umaxv h0, v0.4h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w8, w9, w8
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: ret
+ %res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i1 0)
+ ret i16 %res
+}
+
+define i7 @ctz_i7_v8i1(<8 x i1> %a) {
+; CHECK-LABEL: .LCPI3_0:
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 1
+; CHECK-LABEL: ctz_i7_v8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
+; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: mov w9, #8 // =0x8
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: umaxv b0, v0.8b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w0, w9, w8
+; CHECK-NEXT: ret
+ %res = call i7 @llvm.experimental.cttz.elts.i7.v8i1(<8 x i1> %a, i1 0)
+ ret i7 %res
+}
+
+; ZERO IS POISON
+
+define i8 @ctz_v8i1_poison(<8 x i1> %a) {
+; CHECK-LABEL: .LCPI4_0:
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 1
+; CHECK-LABEL: ctz_v8i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
+; CHECK-NEXT: adrp x8, .LCPI4_0
+; CHECK-NEXT: mov w9, #8 // =0x8
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: umaxv b0, v0.8b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w0, w9, w8
+; CHECK-NEXT: ret
+ %res = call i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1> %a, i1 1)
+ ret i8 %res
+}
+
+declare i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1>, i1)
+declare i7 @llvm.experimental.cttz.elts.i7.v8i1(<8 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1>, i1)
+declare i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32>, i1)
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
new file mode 100644
index 000000000000000..862911ffa1863a1
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=riscv32 -mattr=+v < %s | FileCheck %s -check-prefix=RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v < %s | FileCheck %s -check-prefix=RV64
+
+; WITH VSCALE RANGE
+
+define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
+; RV32-LABEL: ctz_nxv4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: srli a0, a0, 1
+; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv.v.x v10, a0
+; RV32-NEXT: vid.v v11
+; RV32-NEXT: li a1, -1
+; RV32-NEXT: vmadd.vx v11, a1, v10
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT: vmsne.vi v0, v8, 0
+; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vand.vv v8, v11, v8
+; RV32-NEXT: vredmaxu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: lui a1, 16
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: ctz_nxv4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: srli a0, a0, 1
+; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv.v.x v10, a0
+; RV64-NEXT: vid.v v11
+; RV64-NEXT: li a1, -1
+; RV64-NEXT: vmadd.vx v11, a1, v10
+; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV64-NEXT: vmsne.vi v0, v8, 0
+; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vand.vv v8, v11, v8
+; RV64-NEXT: vredmaxu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a1, v8
+; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: lui a1, 16
+; RV64-NEXT: addiw a1, a1, -1
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i32(<vscale x 4 x i32> %a, i1 0)
+ ret i32 %res
+}
+
+; NO VSCALE RANGE
+
+define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
+; RV32-LABEL: ctz_nxv8i1_no_range:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb
+; RV32-NEXT: addi a0, sp, 32
+; RV32-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: srli a0, a0, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3 at plt
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: sw a0, 20(sp)
+; RV32-NEXT: addi a2, sp, 20
+; RV32-NEXT: vsetvli a3, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: vid.v v8
+; RV32-NEXT: li a2, -1
+; RV32-NEXT: vmadd.vx v8, a2, v16
+; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; RV32-NEXT: addi a2, sp, 32
+; RV32-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vmsne.vi v0, v16, 0
+; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV32-NEXT: vmv.v.i v16, 0
+; RV32-NEXT: vmerge.vim v16, v16, -1, v0
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vredmaxu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a2, v8
+; RV32-NEXT: sltu a3, a0, a2
+; RV32-NEXT: li a4, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vx v8, v8, a4
+; RV32-NEXT: vmv.x.s a4, v8
+; RV32-NEXT: sub a1, a1, a4
+; RV32-NEXT: sub a1, a1, a3
+; RV32-NEXT: sub a0, a0, a2
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add sp, sp, a2
+; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: ret
+;
+; RV64-LABEL: ctz_nxv8i1_no_range:
+; RV64: # %bb.0:
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v24, a0
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: li a1, -1
+; RV64-NEXT: vmadd.vx v16, a1, v24
+; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; RV64-NEXT: vmsne.vi v0, v8, 0
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
+; RV64-NEXT: vredmaxu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a1, v8
+; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i16(<vscale x 8 x i16> %a, i1 0)
+ ret i64 %res
+}
+
+define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
+; RV32-LABEL: ctz_nxv16i1:
+; RV32: # %bb.0:
+; RV32-NEXT: vmv1r.v v0, v8
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: vid.v v16
+; RV32-NEXT: li a1, -1
+; RV32-NEXT: vmadd.vx v16, a1, v8
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
+; RV32-NEXT: vredmaxu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: ctz_nxv16i1:
+; RV64: # %bb.0:
+; RV64-NEXT: vmv1r.v v0, v8
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: li a1, -1
+; RV64-NEXT: vmadd.vx v16, a1, v8
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vand.vv v8, v16, v8
+; RV64-NEXT: vredmaxu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a1, v8
+; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+declare i64 @llvm.experimental.cttz.elts.i64.nxv8i16(<vscale x 8 x i16>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv4i32(<vscale x 4 x i32>, i1)
+
+attributes #0 = { vscale_range(2,1024) }
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
new file mode 100644
index 000000000000000..15abc9b75883c8f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=riscv32 < %s | FileCheck %s -check-prefix=RV32
+; RUN: llc -mtriple=riscv64 < %s | FileCheck %s -check-prefix=RV64
+
+; FIXED WIDTH
+
+define i16 @ctz_v4i32(<4 x i32> %a) {
+; RV32-LABEL: ctz_v4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: lw a3, 0(a0)
+; RV32-NEXT: lw a1, 4(a0)
+; RV32-NEXT: lw a2, 12(a0)
+; RV32-NEXT: lw a4, 8(a0)
+; RV32-NEXT: seqz a0, a3
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: andi a0, a0, 4
+; RV32-NEXT: seqz a3, a4
+; RV32-NEXT: addi a3, a3, -1
+; RV32-NEXT: andi a3, a3, 2
+; RV32-NEXT: bltu a3, a0, .LBB0_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a0, a3
+; RV32-NEXT: .LBB0_2:
+; RV32-NEXT: snez a2, a2
+; RV32-NEXT: seqz a1, a1
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: andi a1, a1, 3
+; RV32-NEXT: bltu a2, a1, .LBB0_4
+; RV32-NEXT: # %bb.3:
+; RV32-NEXT: mv a1, a2
+; RV32-NEXT: .LBB0_4:
+; RV32-NEXT: bltu a1, a0, .LBB0_6
+; RV32-NEXT: # %bb.5:
+; RV32-NEXT: mv a0, a1
+; RV32-NEXT: .LBB0_6:
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: andi a0, a1, 255
+; RV32-NEXT: ret
+;
+; RV64-LABEL: ctz_v4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: lw a3, 0(a0)
+; RV64-NEXT: lw a1, 8(a0)
+; RV64-NEXT: lw a2, 24(a0)
+; RV64-NEXT: lw a4, 16(a0)
+; RV64-NEXT: seqz a0, a3
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: andi a0, a0, 4
+; RV64-NEXT: seqz a3, a4
+; RV64-NEXT: addi a3, a3, -1
+; RV64-NEXT: andi a3, a3, 2
+; RV64-NEXT: bltu a3, a0, .LBB0_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a0, a3
+; RV64-NEXT: .LBB0_2:
+; RV64-NEXT: snez a2, a2
+; RV64-NEXT: seqz a1, a1
+; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: andi a1, a1, 3
+; RV64-NEXT: bltu a2, a1, .LBB0_4
+; RV64-NEXT: # %bb.3:
+; RV64-NEXT: mv a1, a2
+; RV64-NEXT: .LBB0_4:
+; RV64-NEXT: bltu a1, a0, .LBB0_6
+; RV64-NEXT: # %bb.5:
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: .LBB0_6:
+; RV64-NEXT: li a1, 4
+; RV64-NEXT: subw a1, a1, a0
+; RV64-NEXT: andi a0, a1, 255
+; RV64-NEXT: ret
+ %res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i1 0)
+ ret i16 %res
+}
+
+; ZERO IS POISON
+
+define i32 @ctz_v2i1_poison(<2 x i1> %a) {
+; RV32-LABEL: ctz_v2i1_poison:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a1, a1, 1
+; RV32-NEXT: slli a0, a0, 31
+; RV32-NEXT: srai a0, a0, 31
+; RV32-NEXT: andi a0, a0, 2
+; RV32-NEXT: bltu a1, a0, .LBB1_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a0, a1
+; RV32-NEXT: .LBB1_2:
+; RV32-NEXT: li a1, 2
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: andi a0, a1, 255
+; RV32-NEXT: ret
+;
+; RV64-LABEL: ctz_v2i1_poison:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a1, a1, 1
+; RV64-NEXT: slli a0, a0, 63
+; RV64-NEXT: srai a0, a0, 63
+; RV64-NEXT: andi a0, a0, 2
+; RV64-NEXT: bltu a1, a0, .LBB1_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: .LBB1_2:
+; RV64-NEXT: li a1, 2
+; RV64-NEXT: subw a1, a1, a0
+; RV64-NEXT: andi a0, a1, 255
+; RV64-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+declare i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1>, i1)
+declare i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32>, i1)
diff --git a/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll
new file mode 100644
index 000000000000000..1eb43db350447a7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll
@@ -0,0 +1,134 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+define i8 @ctz_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: .LCPI0_0:
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 1
+; CHECK-LABEL: ctz_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: pcmpeqw %xmm0, %xmm1
+; CHECK-NEXT: packsswb %xmm1, %xmm1
+; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmoval %eax, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %cl
+; CHECK-NEXT: cmovbel %eax, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %cl
+; CHECK-NEXT: cmovbel %eax, %ecx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %cl
+; CHECK-NEXT: cmovbel %eax, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %cl
+; CHECK-NEXT: cmovbel %eax, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %cl
+; CHECK-NEXT: cmovbel %eax, %ecx
+; CHECK-NEXT: movb $8, %al
+; CHECK-NEXT: subb %cl, %al
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.experimental.cttz.elts.i8.v8i16(<8 x i16> %a, i1 0)
+ ret i8 %res
+}
+
+define i16 @ctz_v4i32(<4 x i32> %a) {
+; CHECK-LABEL: .LCPI1_0:
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 1
+; CHECK-LABEL: ctz_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: pcmpeqd %xmm0, %xmm1
+; CHECK-NEXT: packssdw %xmm1, %xmm1
+; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm0
+; CHECK-NEXT: packsswb %xmm0, %xmm0
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shrl $8, %ecx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmoval %eax, %ecx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shrl $16, %edx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmoval %ecx, %edx
+; CHECK-NEXT: shrl $24, %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmoval %edx, %eax
+; CHECK-NEXT: movb $4, %cl
+; CHECK-NEXT: subb %al, %cl
+; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i1 0)
+ ret i16 %res
+}
+
+; ZERO IS POISON
+
+define i8 @ctz_v8i16_poison(<8 x i16> %a) {
+; CHECK-LABEL: .LCPI2_0:
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 1
+; CHECK-LABEL: ctz_v8i16_poison:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: pcmpeqw %xmm0, %xmm1
+; CHECK-NEXT: packsswb %xmm1, %xmm1
+; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmoval %eax, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %cl
+; CHECK-NEXT: cmovbel %eax, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %cl
+; CHECK-NEXT: cmovbel %eax, %ecx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %cl
+; CHECK-NEXT: cmovbel %eax, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %cl
+; CHECK-NEXT: cmovbel %eax, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %cl
+; CHECK-NEXT: cmovbel %eax, %ecx
+; CHECK-NEXT: movb $8, %al
+; CHECK-NEXT: subb %cl, %al
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.experimental.cttz.elts.i8.v8i16(<8 x i16> %a, i1 1)
+ ret i8 %res
+}
+
+declare i8 @llvm.experimental.cttz.elts.i8.v8i16(<8 x i16>, i1)
+declare i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32>, i1)