[llvm] [AArch64] Add @llvm.experimental.vector.match (PR #101974)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 7 07:12:10 PDT 2024
https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/101974
From 3e95323988f3e8da7a8a1253f8491ebe43b05c73 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Fri, 19 Jul 2024 16:10:51 +0100
Subject: [PATCH 1/2] [AArch64] Add @llvm.experimental.vector.match
This patch introduces an experimental intrinsic for matching the
elements of one vector against the elements of another.
For AArch64 targets that support SVE2, it lowers to a MATCH instruction
for supported fixed and scalable vector types.
---
llvm/docs/LangRef.rst | 53 +++++++++++++++++
.../llvm/Analysis/TargetTransformInfo.h | 9 +++
.../llvm/Analysis/TargetTransformInfoImpl.h | 2 +
llvm/include/llvm/IR/Intrinsics.td | 10 ++++
llvm/lib/Analysis/TargetTransformInfo.cpp | 5 ++
.../SelectionDAG/SelectionDAGBuilder.cpp | 9 +++
llvm/lib/IR/Verifier.cpp | 28 +++++++++
.../Target/AArch64/AArch64ISelLowering.cpp | 46 +++++++++++++++
.../AArch64/AArch64TargetTransformInfo.cpp | 12 ++++
.../AArch64/AArch64TargetTransformInfo.h | 2 +
.../AArch64/intrinsic-vector-match-sve2.ll | 57 +++++++++++++++++++
11 files changed, 233 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 6fa35486669d69..b125a20bc437c6 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19958,6 +19958,59 @@ are undefined.
}
+'``llvm.experimental.vector.match.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. Support for specific vector types is target
+dependent.
+
+::
+
+ declare <<n> x i1> @llvm.experimental.vector.match(<<n> x <ty>> %op1, <<n> x <ty>> %op2, <<n> x i1> %mask, i32 <segsize>)
+ declare <vscale x <n> x i1> @llvm.experimental.vector.match(<vscale x <n> x <ty>> %op1, <vscale x <n> x <ty>> %op2, <vscale x <n> x i1> %mask, i32 <segsize>)
+
+Overview:
+"""""""""
+
+Find elements of the first argument matching any elements of the second.
+
+Arguments:
+""""""""""
+
+The first argument is the search vector, the second argument is the vector of
+elements we are searching for (i.e. for which we consider a match successful),
+and the third argument is a mask that controls which elements of the first
+argument are active. The fourth argument is an immediate that sets the segment
+size for the search window.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.vector.match``' intrinsic compares each element in the
+first argument against potentially several elements of the second, placing
+``1`` in the corresponding element of the output vector if any comparison is
+successful, and ``0`` otherwise. Inactive elements in the mask are set to ``0``
+in the output. The segment size controls the number of elements of the second
+argument that are compared against.
+
+For example, for vectors with 16 elements, if ``segsize = 16`` then each
+element of the first argument is compared against all 16 elements of the second
+argument; but if ``segsize = 4``, then each of the first four elements of the
+first argument is compared against the first four elements of the second
+argument, each of the second four elements of the first argument is compared
+against the second four elements of the second argument, and so forth.
+
+Currently, ``segsize`` needs to be an immediate value. The special value of
+``-1`` is allowed to indicate all elements should be searched.
+
+Support for specific vector types is target dependent. For AArch64 targets with
+SVE2 support, the intrinsic is valid on ``<16 x i8>`` or ``<8 x i16>`` vectors,
+or the scalable equivalents, with a ``segsize`` equal to the known minimum
+number of elements of the vectors (16 or 8, respectively).
+
Matrix Intrinsics
-----------------
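As an illustration of this first form (function and value names below are invented for the example, not taken from the patch), a full-width search over a 16-byte vector uses a segment size equal to the vector length, which is the case SVE2 MATCH supports directly:

  define <16 x i1> @contains_any(<16 x i8> %haystack, <16 x i8> %needles, <16 x i1> %mask) {
    ; Every active byte of %haystack is compared against all 16 bytes of %needles.
    %r = call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %haystack, <16 x i8> %needles, <16 x i1> %mask, i32 16)
    ret <16 x i1> %r
  }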
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 89a85bc8a90864..67dca918bc8795 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1744,6 +1744,10 @@ class TargetTransformInfo {
bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
Align Alignment) const;
+ /// \returns Returns true if the target supports vector match operations for
+ /// the vector type `VT` using a segment size of `SegSize`.
+ bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;
+
struct VPLegalization {
enum VPTransform {
// keep the predicating parameter
@@ -2182,6 +2186,7 @@ class TargetTransformInfo::Concept {
virtual bool supportsScalableVectors() const = 0;
virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
Align Alignment) const = 0;
+ virtual bool hasVectorMatch(VectorType *VT, unsigned SegSize) const = 0;
virtual VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
virtual bool hasArmWideBranch(bool Thumb) const = 0;
@@ -2952,6 +2957,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.hasActiveVectorLength(Opcode, DataType, Alignment);
}
+ bool hasVectorMatch(VectorType *VT, unsigned SegSize) const override {
+ return Impl.hasVectorMatch(VT, SegSize);
+ }
+
VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const override {
return Impl.getVPLegalizationStrategy(PI);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 50040dc8f6165b..8130f2727e9d19 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -972,6 +972,8 @@ class TargetTransformInfoImplBase {
return false;
}
+ bool hasVectorMatch(VectorType *VT, unsigned SegSize) const { return false; }
+
TargetTransformInfo::VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const {
return TargetTransformInfo::VPLegalization(
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 20dd921ddbd230..0391119c5aca33 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1912,6 +1912,16 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
[ IntrArgMemOnly ]>;
+// Experimental match
+def int_experimental_vector_match : DefaultAttrsIntrinsic<
+ [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ],
+ [ llvm_anyvector_ty,
+ LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, // Mask
+ llvm_i32_ty ], // Segment size
+ [ IntrNoMem, IntrNoSync, IntrWillReturn,
+ ImmArg<ArgIndex<3>> ]>;
+
// Operators
let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
// Integer arithmetic
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index b5195f764cbd1c..dcf8eda6504416 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1354,6 +1354,11 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
}
+bool TargetTransformInfo::hasVectorMatch(VectorType *VT,
+ unsigned SegSize) const {
+ return TTIImpl->hasVectorMatch(VT, SegSize);
+}
+
TargetTransformInfo::Concept::~Concept() = default;
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 25213f587116d5..66911a9075edaf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8137,6 +8137,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ResultVT, Vec, Index));
return;
}
+ case Intrinsic::experimental_vector_match: {
+ auto *VT = dyn_cast<VectorType>(I.getOperand(0)->getType());
+ auto SegmentSize = cast<ConstantInt>(I.getOperand(3))->getLimitedValue();
+ const auto &TTI =
+ TLI.getTargetMachine().getTargetTransformInfo(*I.getFunction());
+ assert(VT && TTI.hasVectorMatch(VT, SegmentSize) && "Unsupported type!");
+ visitTargetIntrinsic(I, Intrinsic);
+ return;
+ }
case Intrinsic::vector_reverse:
visitVectorReverse(I);
return;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 1cd5eb36c4ab69..97d8073edeb798 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6108,6 +6108,34 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
&Call);
break;
}
+ case Intrinsic::experimental_vector_match: {
+ Value *Op1 = Call.getArgOperand(0);
+ Value *Op2 = Call.getArgOperand(1);
+ Value *Mask = Call.getArgOperand(2);
+ Value *SegSize = Call.getArgOperand(3);
+
+ VectorType *OpTy = dyn_cast<VectorType>(Op1->getType());
+ VectorType *MaskTy = dyn_cast<VectorType>(Mask->getType());
+ Check(OpTy && MaskTy, "experimental.vector.match operands are not vectors.",
+ &Call);
+ Check(Op2->getType() == OpTy,
+ "experimental.vector.match first two operands must have matching "
+ "types.",
+ &Call);
+ Check(isa<ConstantInt>(SegSize),
+ "experimental.vector.match segment size needs to be an immediate "
+ "integer.",
+ &Call);
+
+ ElementCount EC = OpTy->getElementCount();
+ Check(MaskTy->getElementCount() == EC,
+ "experimental.vector.match mask must have the same number of "
+ "elements as the remaining vector operands.",
+ &Call);
+ Check(MaskTy->getElementType()->isIntegerTy(1),
+ "experimental.vector.match mask element type is not i1.", &Call);
+ break;
+ }
case Intrinsic::vector_insert: {
Value *Vec = Call.getArgOperand(0);
Value *SubVec = Call.getArgOperand(1);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 48e1b96d841efb..bf3012d953929b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6255,6 +6255,51 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
}
+ case Intrinsic::experimental_vector_match: {
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_match, dl, MVT::i64);
+
+ auto Op1 = Op.getOperand(1);
+ auto Op2 = Op.getOperand(2);
+ auto Mask = Op.getOperand(3);
+ auto SegmentSize =
+ cast<ConstantSDNode>(Op.getOperand(4))->getLimitedValue();
+
+ EVT VT = Op.getValueType();
+ auto MinNumElts = VT.getVectorMinNumElements();
+
+ assert(Op1.getValueType() == Op2.getValueType() && "Type mismatch.");
+ assert(Op1.getValueSizeInBits().getKnownMinValue() == 128 &&
+ "Custom lower only works on 128-bit segments.");
+ assert((Op1.getValueType().getVectorElementType() == MVT::i8 ||
+ Op1.getValueType().getVectorElementType() == MVT::i16) &&
+ "Custom lower only supports 8-bit or 16-bit characters.");
+ assert(SegmentSize == MinNumElts && "Custom lower needs segment size to "
+ "match minimum number of elements.");
+
+ if (VT.isScalableVector())
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Mask, Op1, Op2);
+
+ // We can use the SVE2 match instruction to lower this intrinsic by
+ // converting the operands to scalable vectors, doing a match, and then
+ // extracting a fixed-width subvector from the scalable vector.
+
+ EVT OpVT = Op1.getValueType();
+ EVT OpContainerVT = getContainerForFixedLengthVector(DAG, OpVT);
+ EVT MatchVT = OpContainerVT.changeElementType(MVT::i1);
+
+ auto ScalableOp1 = convertToScalableVector(DAG, OpContainerVT, Op1);
+ auto ScalableOp2 = convertToScalableVector(DAG, OpContainerVT, Op2);
+ auto ScalableMask = DAG.getNode(ISD::SIGN_EXTEND, dl, OpVT, Mask);
+ ScalableMask = convertFixedMaskToScalableVector(ScalableMask, DAG);
+
+ SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MatchVT, ID,
+ ScalableMask, ScalableOp1, ScalableOp2);
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT,
+ DAG.getNode(ISD::SIGN_EXTEND, dl, OpContainerVT, Match),
+ DAG.getVectorIdxConstant(0, dl));
+ }
}
}
@@ -27304,6 +27349,7 @@ void AArch64TargetLowering::ReplaceNodeResults(
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
+ case Intrinsic::experimental_vector_match:
case Intrinsic::get_active_lane_mask: {
if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
return;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 80d5168ae961ab..db7da51060428c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4041,6 +4041,18 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
}
}
+bool AArch64TTIImpl::hasVectorMatch(VectorType *VT, unsigned SegSize) const {
+ // Check that the target has SVE2 (and SVE is available), that `VT' is a
+ // legal type for MATCH, and that the segment size is 128-bit.
+ if (ST->hasSVE2() && ST->isSVEAvailable() &&
+ VT->getPrimitiveSizeInBits().getKnownMinValue() == 128 &&
+ VT->getElementCount().getKnownMinValue() == SegSize &&
+ (VT->getElementCount().getKnownMinValue() == 8 ||
+ VT->getElementCount().getKnownMinValue() == 16))
+ return true;
+ return false;
+}
+
InstructionCost
AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
FastMathFlags FMF,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 28e45207596ecd..78550464318d22 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -392,6 +392,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return ST->hasSVE();
}
+ bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;
+
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
std::optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind);
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll b/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll
new file mode 100644
index 00000000000000..0df92dfa800006
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
+
+define <vscale x 16 x i1> @match_nxv16i8(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: ret
+ %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %op2, <vscale x 16 x i1> %mask, i32 16)
+ ret <vscale x 16 x i1> %r
+}
+
+define <vscale x 8 x i1> @match_nxv8i16(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %op2, <vscale x 8 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: match p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: ret
+ %r = tail call <vscale x 8 x i1> @llvm.experimental.vector.match(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %op2, <vscale x 8 x i1> %mask, i32 8)
+ ret <vscale x 8 x i1> %r
+}
+
+define <16 x i1> @match_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) #0 {
+; CHECK-LABEL: match_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v2.16b, v2.16b, #7
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: cmlt v2.16b, v2.16b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0
+; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask, i32 16)
+ ret <16 x i1> %r
+}
+
+define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #0 {
+; CHECK-LABEL: match_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: shl v2.8h, v2.8h, #15
+; CHECK-NEXT: cmlt v2.8h, v2.8h, #0
+; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0
+; CHECK-NEXT: match p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: ret
+ %r = tail call <8 x i1> @llvm.experimental.vector.match(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask, i32 8)
+ ret <8 x i1> %r
+}
+
+attributes #0 = { "target-features"="+sve2" }
From 9df5ff0d939d89ef99bc78832fa9347c9cc69ade Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Thu, 3 Oct 2024 09:21:22 -0700
Subject: [PATCH 2/2] Remove SegSize and provide general lowering
---
llvm/docs/LangRef.rst | 36 +--
.../llvm/Analysis/TargetTransformInfo.h | 13 +-
.../llvm/Analysis/TargetTransformInfoImpl.h | 4 +-
llvm/include/llvm/IR/Intrinsics.td | 8 +-
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 35 ++-
llvm/lib/IR/Verifier.cpp | 27 +--
.../Target/AArch64/AArch64ISelLowering.cpp | 65 ++---
.../AArch64/AArch64TargetTransformInfo.cpp | 22 +-
.../AArch64/AArch64TargetTransformInfo.h | 2 +-
.../AArch64/intrinsic-vector-match-sve2.ll | 228 ++++++++++++++++--
11 files changed, 333 insertions(+), 111 deletions(-)
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index b125a20bc437c6..0d0742b54663ba 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19969,47 +19969,33 @@ dependent.
::
- declare <<n> x i1> @llvm.experimental.vector.match(<<n> x <ty>> %op1, <<n> x <ty>> %op2, <<n> x i1> %mask, i32 <segsize>)
- declare <vscale x <n> x i1> @llvm.experimental.vector.match(<vscale x <n> x <ty>> %op1, <vscale x <n> x <ty>> %op2, <vscale x <n> x i1> %mask, i32 <segsize>)
+ declare <<n> x i1> @llvm.experimental.vector.match(<<n> x <ty>> %op1, <<m> x <ty>> %op2, <<n> x i1> %mask)
+ declare <vscale x <n> x i1> @llvm.experimental.vector.match(<vscale x <n> x <ty>> %op1, <<m> x <ty>> %op2, <vscale x <n> x i1> %mask)
Overview:
"""""""""
-Find elements of the first argument matching any elements of the second.
+Find active elements of the first argument matching any elements of the second.
Arguments:
""""""""""
-The first argument is the search vector, the second argument is the vector of
+The first argument is the search vector, the second argument the vector of
elements we are searching for (i.e. for which we consider a match successful),
and the third argument is a mask that controls which elements of the first
-argument are active. The fourth argument is an immediate that sets the segment
-size for the search window.
+argument are active.
Semantics:
""""""""""
-The '``llvm.experimental.vector.match``' intrinsic compares each element in the
-first argument against potentially several elements of the second, placing
+The '``llvm.experimental.vector.match``' intrinsic compares each active element
+in the first argument against the elements of the second argument, placing
``1`` in the corresponding element of the output vector if any comparison is
successful, and ``0`` otherwise. Inactive elements in the mask are set to ``0``
-in the output. The segment size controls the number of elements of the second
-argument that are compared against.
-
-For example, for vectors with 16 elements, if ``segsize = 16`` then each
-element of the first argument is compared against all 16 elements of the second
-argument; but if ``segsize = 4``, then each of the first four elements of the
-first argument is compared against the first four elements of the second
-argument, each of the second four elements of the first argument is compared
-against the second four elements of the second argument, and so forth.
-
-Currently, ``segsize`` needs to be an immediate value. The special value of
-``-1`` is allowed to indicate all elements should be searched.
-
-Support for specific vector types is target dependent. For AArch64 targets with
-SVE2 support, the intrinsic is valid on ``<16 x i8>`` or ``<8 x i16>`` vectors,
-or the scalable equivalents, with a ``segsize`` equal to the known minimum
-number of elements of the vectors (16 or 8, respectively).
+in the output.
+
+The second argument needs to be a fixed-length vector with the same element
+type as the first argument.
Matrix Intrinsics
-----------------
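As a sketch of the revised three-argument form (value names invented for illustration, mirroring the calls in the updated tests below), a scalable search vector is matched against a fixed-length set of needles:

  ; Each active byte of %op1 is compared against the 16 needle bytes in %op2.
  %r = call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <16 x i8> %op2, <vscale x 16 x i1> %mask)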
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 67dca918bc8795..5dbdfc06ac5786 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1744,9 +1744,10 @@ class TargetTransformInfo {
bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
Align Alignment) const;
- /// \returns Returns true if the target supports vector match operations for
- /// the vector type `VT` using a segment size of `SegSize`.
- bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;
+ /// \returns True if the target has hardware support for vector match
+ /// operations between vectors of type `VT` and search vectors of `SearchSize`
+ /// elements, and false otherwise.
+ bool hasVectorMatch(VectorType *VT, unsigned SearchSize) const;
struct VPLegalization {
enum VPTransform {
@@ -2186,7 +2187,7 @@ class TargetTransformInfo::Concept {
virtual bool supportsScalableVectors() const = 0;
virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
Align Alignment) const = 0;
- virtual bool hasVectorMatch(VectorType *VT, unsigned SegSize) const = 0;
+ virtual bool hasVectorMatch(VectorType *VT, unsigned SearchSize) const = 0;
virtual VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
virtual bool hasArmWideBranch(bool Thumb) const = 0;
@@ -2957,8 +2958,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.hasActiveVectorLength(Opcode, DataType, Alignment);
}
- bool hasVectorMatch(VectorType *VT, unsigned SegSize) const override {
- return Impl.hasVectorMatch(VT, SegSize);
+ bool hasVectorMatch(VectorType *VT, unsigned SearchSize) const override {
+ return Impl.hasVectorMatch(VT, SearchSize);
}
VPLegalization
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 8130f2727e9d19..e618f774bac4d7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -972,7 +972,9 @@ class TargetTransformInfoImplBase {
return false;
}
- bool hasVectorMatch(VectorType *VT, unsigned SegSize) const { return false; }
+ bool hasVectorMatch(VectorType *VT, unsigned SearchSize) const {
+ return false;
+ }
TargetTransformInfo::VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const {
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 0391119c5aca33..4447db7ddfcebe 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1916,11 +1916,9 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
def int_experimental_vector_match : DefaultAttrsIntrinsic<
[ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ],
[ llvm_anyvector_ty,
- LLVMMatchType<0>,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, // Mask
- llvm_i32_ty ], // Segment size
- [ IntrNoMem, IntrNoSync, IntrWillReturn,
- ImmArg<ArgIndex<3>> ]>;
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], // Mask
+ [ IntrNoMem, IntrNoSync, IntrWillReturn ]>;
// Operators
let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index dcf8eda6504416..3611178cbe43c6 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1355,8 +1355,8 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
}
bool TargetTransformInfo::hasVectorMatch(VectorType *VT,
- unsigned SegSize) const {
- return TTIImpl->hasVectorMatch(VT, SegSize);
+ unsigned SearchSize) const {
+ return TTIImpl->hasVectorMatch(VT, SearchSize);
}
TargetTransformInfo::Concept::~Concept() = default;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 66911a9075edaf..3a3d8e5c8ea255 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8138,12 +8138,39 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
}
case Intrinsic::experimental_vector_match: {
- auto *VT = dyn_cast<VectorType>(I.getOperand(0)->getType());
- auto SegmentSize = cast<ConstantInt>(I.getOperand(3))->getLimitedValue();
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+ SDValue Mask = getValue(I.getOperand(2));
+ EVT Op1VT = Op1.getValueType();
+ EVT Op2VT = Op2.getValueType();
+ EVT ResVT = Mask.getValueType();
+ unsigned SearchSize = Op2VT.getVectorNumElements();
+
+ LLVMContext &Ctx = *DAG.getContext();
const auto &TTI =
TLI.getTargetMachine().getTargetTransformInfo(*I.getFunction());
- assert(VT && TTI.hasVectorMatch(VT, SegmentSize) && "Unsupported type!");
- visitTargetIntrinsic(I, Intrinsic);
+
+ // If the target has native support for this vector match operation, lower
+ // the intrinsic directly; otherwise, lower it below.
+ if (TTI.hasVectorMatch(cast<VectorType>(Op1VT.getTypeForEVT(Ctx)),
+ SearchSize)) {
+ visitTargetIntrinsic(I, Intrinsic);
+ return;
+ }
+
+ SDValue Ret = DAG.getNode(ISD::SPLAT_VECTOR, sdl, ResVT,
+ DAG.getConstant(0, sdl, MVT::i1));
+
+ for (unsigned i = 0; i < SearchSize; ++i) {
+ SDValue Op2Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl,
+ Op2VT.getVectorElementType(), Op2,
+ DAG.getVectorIdxConstant(i, sdl));
+ SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, sdl, Op1VT, Op2Elem);
+ SDValue Cmp = DAG.getSetCC(sdl, ResVT, Op1, Splat, ISD::SETEQ);
+ Ret = DAG.getNode(ISD::OR, sdl, ResVT, Ret, Cmp);
+ }
+
+ setValue(&I, DAG.getNode(ISD::AND, sdl, ResVT, Ret, Mask));
return;
}
case Intrinsic::vector_reverse:
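The generic lowering added above is, in effect, the per-needle expansion sketched below in LLVM IR (hand-written for illustration with a 2-element search vector and invented value names; the actual code builds the equivalent SDNodes directly):

  ; Splat each needle, compare it against the whole of %op1, OR the results,
  ; then clear inactive lanes with %mask.
  %n0  = extractelement <2 x i8> %op2, i64 0
  %t0  = insertelement <16 x i8> poison, i8 %n0, i64 0
  %sp0 = shufflevector <16 x i8> %t0, <16 x i8> poison, <16 x i32> zeroinitializer
  %eq0 = icmp eq <16 x i8> %op1, %sp0
  %n1  = extractelement <2 x i8> %op2, i64 1
  %t1  = insertelement <16 x i8> poison, i8 %n1, i64 0
  %sp1 = shufflevector <16 x i8> %t1, <16 x i8> poison, <16 x i32> zeroinitializer
  %eq1 = icmp eq <16 x i8> %op1, %sp1
  %any = or <16 x i1> %eq0, %eq1
  %r   = and <16 x i1> %any, %mask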
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 97d8073edeb798..58b1f3f44d9d45 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6112,28 +6112,21 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
Value *Op1 = Call.getArgOperand(0);
Value *Op2 = Call.getArgOperand(1);
Value *Mask = Call.getArgOperand(2);
- Value *SegSize = Call.getArgOperand(3);
- VectorType *OpTy = dyn_cast<VectorType>(Op1->getType());
+ VectorType *Op1Ty = dyn_cast<VectorType>(Op1->getType());
+ VectorType *Op2Ty = dyn_cast<VectorType>(Op2->getType());
VectorType *MaskTy = dyn_cast<VectorType>(Mask->getType());
- Check(OpTy && MaskTy, "experimental.vector.match operands are not vectors.",
- &Call);
- Check(Op2->getType() == OpTy,
- "experimental.vector.match first two operands must have matching "
- "types.",
- &Call);
- Check(isa<ConstantInt>(SegSize),
- "experimental.vector.match segment size needs to be an immediate "
- "integer.",
- &Call);
- ElementCount EC = OpTy->getElementCount();
- Check(MaskTy->getElementCount() == EC,
- "experimental.vector.match mask must have the same number of "
- "elements as the remaining vector operands.",
+ Check(Op1Ty && Op2Ty && MaskTy, "Operands must be vectors.", &Call);
+ Check(!isa<ScalableVectorType>(Op2Ty), "Second operand cannot be scalable.",
+ &Call);
+ Check(Op1Ty->getElementType() == Op2Ty->getElementType(),
+ "First two operands must have the same element type.", &Call);
+ Check(Op1Ty->getElementCount() == MaskTy->getElementCount(),
+ "First operand and mask must have the same number of elements.",
&Call);
Check(MaskTy->getElementType()->isIntegerTy(1),
- "experimental.vector.match mask element type is not i1.", &Call);
+ "Mask must be a vector of i1's.", &Call);
break;
}
case Intrinsic::vector_insert: {
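To illustrate the updated checks (an invented example, not from the patch's tests): now that the second operand is independently overloaded, a call such as the one below satisfies the intrinsic's type constraints but is rejected by the verifier because the search vector must be fixed-length:

  ; Fails verification: "Second operand cannot be scalable."
  %bad = call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i1> %mask)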
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bf3012d953929b..5f0bd4aa720658 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6262,41 +6262,48 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
auto Op1 = Op.getOperand(1);
auto Op2 = Op.getOperand(2);
auto Mask = Op.getOperand(3);
- auto SegmentSize =
- cast<ConstantSDNode>(Op.getOperand(4))->getLimitedValue();
- EVT VT = Op.getValueType();
- auto MinNumElts = VT.getVectorMinNumElements();
-
- assert(Op1.getValueType() == Op2.getValueType() && "Type mismatch.");
- assert(Op1.getValueSizeInBits().getKnownMinValue() == 128 &&
- "Custom lower only works on 128-bit segments.");
- assert((Op1.getValueType().getVectorElementType() == MVT::i8 ||
- Op1.getValueType().getVectorElementType() == MVT::i16) &&
- "Custom lower only supports 8-bit or 16-bit characters.");
- assert(SegmentSize == MinNumElts && "Custom lower needs segment size to "
- "match minimum number of elements.");
-
- if (VT.isScalableVector())
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Mask, Op1, Op2);
-
- // We can use the SVE2 match instruction to lower this intrinsic by
- // converting the operands to scalable vectors, doing a match, and then
- // extracting a fixed-width subvector from the scalable vector.
+ EVT Op1VT = Op1.getValueType();
+ EVT Op2VT = Op2.getValueType();
+ EVT ResVT = Op.getValueType();
- EVT OpVT = Op1.getValueType();
- EVT OpContainerVT = getContainerForFixedLengthVector(DAG, OpVT);
+ assert((Op1VT.getVectorElementType() == MVT::i8 ||
+ Op1VT.getVectorElementType() == MVT::i16) &&
+ "Expected 8-bit or 16-bit characters.");
+ assert(!Op2VT.isScalableVector() && "Search vector cannot be scalable.");
+ assert(Op1VT.getVectorElementType() == Op2VT.getVectorElementType() &&
+ "Operand type mismatch.");
+ assert(Op1VT.getVectorMinNumElements() == Op2VT.getVectorNumElements() &&
+ "Invalid operands.");
+
+ // Wrap the search vector in a scalable vector.
+ EVT OpContainerVT = getContainerForFixedLengthVector(DAG, Op2VT);
+ Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
+
+ // If the result is scalable, we need to broadcast the search vector across
+ // the SVE register and then carry out the MATCH.
+ if (ResVT.isScalableVector()) {
+ Op2 = DAG.getNode(AArch64ISD::DUPLANE128, dl, OpContainerVT, Op2,
+ DAG.getTargetConstant(0, dl, MVT::i64));
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResVT, ID, Mask, Op1,
+ Op2);
+ }
+
+ // If the result is fixed, we can still use MATCH but we need to wrap the
+ // first operand and the mask in scalable vectors before doing so.
EVT MatchVT = OpContainerVT.changeElementType(MVT::i1);
- auto ScalableOp1 = convertToScalableVector(DAG, OpContainerVT, Op1);
- auto ScalableOp2 = convertToScalableVector(DAG, OpContainerVT, Op2);
- auto ScalableMask = DAG.getNode(ISD::SIGN_EXTEND, dl, OpVT, Mask);
- ScalableMask = convertFixedMaskToScalableVector(ScalableMask, DAG);
+ // Wrap the operands.
+ Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
+ Mask = DAG.getNode(ISD::ANY_EXTEND, dl, Op1VT, Mask);
+ Mask = convertFixedMaskToScalableVector(Mask, DAG);
- SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MatchVT, ID,
- ScalableMask, ScalableOp1, ScalableOp2);
+ // Carry out the match.
+ SDValue Match =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MatchVT, ID, Mask, Op1, Op2);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT,
+ // Extract and return the result.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op1VT,
DAG.getNode(ISD::SIGN_EXTEND, dl, OpContainerVT, Match),
DAG.getVectorIdxConstant(0, dl));
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index db7da51060428c..4cec3f6b369564 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4041,14 +4041,26 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
}
}
-bool AArch64TTIImpl::hasVectorMatch(VectorType *VT, unsigned SegSize) const {
- // Check that the target has SVE2 (and SVE is available), that `VT' is a
- // legal type for MATCH, and that the segment size is 128-bit.
+bool AArch64TTIImpl::hasVectorMatch(VectorType *VT, unsigned SearchSize) const {
+ // Check that (i) the target has SVE2 and SVE is available, (ii) `VT' is a
+ // legal type for MATCH, and (iii) the search vector can be broadcast
+ // efficiently to a legal type.
+ //
+ // Currently, we require the length of the search vector to match the minimum
+ // number of elements of `VT'. In practice this means we only support the
+ // cases (nxv16i8, 16), (v16i8, 16), (nxv8i16, 8), and (v8i16, 8), where the
+ // first element of the tuples corresponds to the type of the first argument
+ // and the second the length of the search vector.
+ //
+ // In the future we can support more cases. For example, (nxv16i8, 4) could
+ // be efficiently supported by using a DUP.S to broadcast the search
+ // elements, and more exotic cases like (nxv16i8, 5) could be supported by a
+ // sequence of SEL(DUP).
if (ST->hasSVE2() && ST->isSVEAvailable() &&
VT->getPrimitiveSizeInBits().getKnownMinValue() == 128 &&
- VT->getElementCount().getKnownMinValue() == SegSize &&
(VT->getElementCount().getKnownMinValue() == 8 ||
- VT->getElementCount().getKnownMinValue() == 16))
+ VT->getElementCount().getKnownMinValue() == 16) &&
+ VT->getElementCount().getKnownMinValue() == SearchSize)
return true;
return false;
}
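Concretely, of the two calls below (value names invented; the shapes match the cases exercised by the tests), the first is reported as supported and selects a single MATCH, while the second is not and falls back to the generic per-element expansion in SelectionDAGBuilder:

  ; (nxv16i8, 16): supported, lowers to MATCH.
  %a = call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <16 x i8> %op2, <vscale x 16 x i1> %mask)
  ; (nxv16i8, 4): not yet supported, expanded generically.
  %b = call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <4 x i8> %small, <vscale x 16 x i1> %mask)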
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 78550464318d22..bcc992184f1b38 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -392,7 +392,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return ST->hasSVE();
}
- bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;
+ bool hasVectorMatch(VectorType *VT, unsigned SearchSize) const;
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
std::optional<FastMathFlags> FMF,
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll b/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll
index 0df92dfa800006..d84a54f327a9bc 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll
@@ -1,41 +1,239 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
-define <vscale x 16 x i1> @match_nxv16i8(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
-; CHECK-LABEL: match_nxv16i8:
+define <vscale x 16 x i1> @match_nxv16i8_v1i8(<vscale x 16 x i8> %op1, <1 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv16i8_v1i8:
; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: umov w8, v1.b[0]
+; CHECK-NEXT: mov z1.b, w8
+; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: ret
+ %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <1 x i8> %op2, <vscale x 16 x i1> %mask)
+ ret <vscale x 16 x i1> %r
+}
+
+define <vscale x 16 x i1> @match_nxv16i8_v2i8(<vscale x 16 x i8> %op1, <2 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv16i8_v2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: mov z2.b, w9
+; CHECK-NEXT: mov z1.b, w8
+; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z1.b
+; CHECK-NEXT: cmpeq p1.b, p1/z, z0.b, z2.b
+; CHECK-NEXT: sel p1.b, p1, p1.b, p2.b
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: ret
+ %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <2 x i8> %op2, <vscale x 16 x i1> %mask)
+ ret <vscale x 16 x i1> %r
+}
+
+define <vscale x 16 x i1> @match_nxv16i8_v4i8(<vscale x 16 x i8> %op1, <4 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv16i8_v4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: umov w8, v1.h[1]
+; CHECK-NEXT: umov w9, v1.h[0]
+; CHECK-NEXT: umov w10, v1.h[2]
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: mov z2.b, w8
+; CHECK-NEXT: mov z3.b, w9
+; CHECK-NEXT: umov w8, v1.h[3]
+; CHECK-NEXT: mov z1.b, w10
+; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z2.b
+; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z3.b
+; CHECK-NEXT: mov z2.b, w8
+; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z1.b
+; CHECK-NEXT: cmpeq p1.b, p1/z, z0.b, z2.b
+; CHECK-NEXT: mov p2.b, p3/m, p3.b
+; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov p1.b, p2/m, p2.b
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <4 x i8> %op2, <vscale x 16 x i1> %mask)
+ ret <vscale x 16 x i1> %r
+}
+
+define <vscale x 16 x i1> @match_nxv16i8_v8i8(<vscale x 16 x i8> %op1, <8 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv16i8_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: umov w8, v1.b[1]
+; CHECK-NEXT: umov w9, v1.b[0]
+; CHECK-NEXT: umov w10, v1.b[2]
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: mov z2.b, w8
+; CHECK-NEXT: mov z3.b, w9
+; CHECK-NEXT: umov w8, v1.b[3]
+; CHECK-NEXT: mov z4.b, w10
+; CHECK-NEXT: umov w9, v1.b[4]
+; CHECK-NEXT: umov w10, v1.b[7]
+; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z2.b
+; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z3.b
+; CHECK-NEXT: mov z2.b, w8
+; CHECK-NEXT: umov w8, v1.b[5]
+; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z4.b
+; CHECK-NEXT: mov z3.b, w9
+; CHECK-NEXT: umov w9, v1.b[6]
+; CHECK-NEXT: mov p2.b, p3/m, p3.b
+; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z2.b
+; CHECK-NEXT: mov z1.b, w8
+; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
+; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z3.b
+; CHECK-NEXT: mov z2.b, w9
+; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
+; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z1.b
+; CHECK-NEXT: mov z1.b, w10
+; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
+; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z2.b
+; CHECK-NEXT: cmpeq p1.b, p1/z, z0.b, z1.b
+; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
+; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov p1.b, p2/m, p2.b
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <8 x i8> %op2, <vscale x 16 x i1> %mask)
+ ret <vscale x 16 x i1> %r
+}
+
+define <vscale x 16 x i1> @match_nxv16i8_v16i8(<vscale x 16 x i8> %op1, <16 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv16i8_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z1.q, q1
; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT: ret
- %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %op2, <vscale x 16 x i1> %mask, i32 16)
+ %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <16 x i8> %op2, <vscale x 16 x i1> %mask)
ret <vscale x 16 x i1> %r
}
-define <vscale x 8 x i1> @match_nxv8i16(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %op2, <vscale x 8 x i1> %mask) #0 {
-; CHECK-LABEL: match_nxv8i16:
+define <16 x i1> @match_v16i8_v1i8(<16 x i8> %op1, <1 x i8> %op2, <16 x i1> %mask) #0 {
+; CHECK-LABEL: match_v16i8_v1i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: match p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: dup v1.16b, v1.b[0]
+; CHECK-NEXT: cmeq v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
- %r = tail call <vscale x 8 x i1> @llvm.experimental.vector.match(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %op2, <vscale x 8 x i1> %mask, i32 8)
- ret <vscale x 8 x i1> %r
+ %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <1 x i8> %op2, <16 x i1> %mask)
+ ret <16 x i1> %r
+}
+
+define <16 x i1> @match_v16i8_v2i8(<16 x i8> %op1, <2 x i8> %op2, <16 x i1> %mask) #0 {
+; CHECK-LABEL: match_v16i8_v2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: dup v3.16b, v1.b[4]
+; CHECK-NEXT: dup v1.16b, v1.b[0]
+; CHECK-NEXT: cmeq v3.16b, v0.16b, v3.16b
+; CHECK-NEXT: cmeq v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: ret
+ %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <2 x i8> %op2, <16 x i1> %mask)
+ ret <16 x i1> %r
}
-define <16 x i1> @match_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) #0 {
-; CHECK-LABEL: match_v16i8:
+define <16 x i1> @match_v16i8_v4i8(<16 x i8> %op1, <4 x i8> %op2, <16 x i1> %mask) #0 {
+; CHECK-LABEL: match_v16i8_v4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: dup v3.16b, v1.b[2]
+; CHECK-NEXT: dup v4.16b, v1.b[0]
+; CHECK-NEXT: dup v5.16b, v1.b[4]
+; CHECK-NEXT: dup v1.16b, v1.b[6]
+; CHECK-NEXT: cmeq v3.16b, v0.16b, v3.16b
+; CHECK-NEXT: cmeq v4.16b, v0.16b, v4.16b
+; CHECK-NEXT: cmeq v5.16b, v0.16b, v5.16b
+; CHECK-NEXT: cmeq v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: orr v1.16b, v4.16b, v3.16b
+; CHECK-NEXT: orr v0.16b, v5.16b, v0.16b
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: ret
+ %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <4 x i8> %op2, <16 x i1> %mask)
+ ret <16 x i1> %r
+}
+
+define <16 x i1> @match_v16i8_v8i8(<16 x i8> %op1, <8 x i8> %op2, <16 x i1> %mask) #0 {
+; CHECK-LABEL: match_v16i8_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: dup v3.16b, v1.b[1]
+; CHECK-NEXT: dup v4.16b, v1.b[0]
+; CHECK-NEXT: dup v5.16b, v1.b[2]
+; CHECK-NEXT: dup v6.16b, v1.b[3]
+; CHECK-NEXT: dup v7.16b, v1.b[4]
+; CHECK-NEXT: dup v16.16b, v1.b[5]
+; CHECK-NEXT: dup v17.16b, v1.b[6]
+; CHECK-NEXT: dup v1.16b, v1.b[7]
+; CHECK-NEXT: cmeq v3.16b, v0.16b, v3.16b
+; CHECK-NEXT: cmeq v4.16b, v0.16b, v4.16b
+; CHECK-NEXT: cmeq v5.16b, v0.16b, v5.16b
+; CHECK-NEXT: cmeq v6.16b, v0.16b, v6.16b
+; CHECK-NEXT: cmeq v7.16b, v0.16b, v7.16b
+; CHECK-NEXT: cmeq v16.16b, v0.16b, v16.16b
+; CHECK-NEXT: orr v3.16b, v4.16b, v3.16b
+; CHECK-NEXT: orr v4.16b, v5.16b, v6.16b
+; CHECK-NEXT: orr v5.16b, v7.16b, v16.16b
+; CHECK-NEXT: cmeq v6.16b, v0.16b, v17.16b
+; CHECK-NEXT: cmeq v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: orr v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: orr v4.16b, v5.16b, v6.16b
+; CHECK-NEXT: orr v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: orr v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: ret
+ %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <8 x i8> %op2, <16 x i1> %mask)
+ ret <16 x i1> %r
+}
+
+define <16 x i1> @match_v16i8_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) #0 {
+; CHECK-LABEL: match_v16i8_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: shl v2.16b, v2.16b, #7
; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: cmlt v2.16b, v2.16b, #0
; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0
; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
- %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask, i32 16)
+ %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask)
ret <16 x i1> %r
}
+define <vscale x 8 x i1> @match_nxv8i16_v8i16(<vscale x 8 x i16> %op1, <8 x i16> %op2, <vscale x 8 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv8i16_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z1.q, q1
+; CHECK-NEXT: match p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: ret
+ %r = tail call <vscale x 8 x i1> @llvm.experimental.vector.match(<vscale x 8 x i16> %op1, <8 x i16> %op2, <vscale x 8 x i1> %mask)
+ ret <vscale x 8 x i1> %r
+}
+
define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #0 {
; CHECK-LABEL: match_v8i16:
; CHECK: // %bb.0:
@@ -43,14 +241,12 @@ define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #0
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: shl v2.8h, v2.8h, #15
-; CHECK-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0
; CHECK-NEXT: match p0.h, p0/z, z0.h, z1.h
; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
- %r = tail call <8 x i1> @llvm.experimental.vector.match(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask, i32 8)
+ %r = tail call <8 x i1> @llvm.experimental.vector.match(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask)
ret <8 x i1> %r
}