[llvm] [AArch64] Add MATCH loops to LoopIdiomVectorizePass (PR #101976)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 2 02:20:16 PDT 2024
https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/101976
From ba6e9b594549ce7972f63af1ba8d8b434641cdf3 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Fri, 19 Jul 2024 16:10:51 +0100
Subject: [PATCH 1/5] [AArch64] Add @llvm.experimental.vector.match
This patch introduces an experimental intrinsic for matching the
elements of one vector against the elements of another.
For AArch64 targets that support SVE2, it lowers to a MATCH instruction
for supported fixed and scalable types.
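
For example (taken from the new CodeGen test), the call

  %r = call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %op2, <vscale x 16 x i1> %mask, i32 16)

lowers to a single predicated instruction:

  match p0.b, p0/z, z0.b, z1.b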
---
llvm/docs/LangRef.rst | 45 +++++++++++++++
.../llvm/Analysis/TargetTransformInfo.h | 9 +++
.../llvm/Analysis/TargetTransformInfoImpl.h | 2 +
llvm/include/llvm/IR/Intrinsics.td | 10 ++++
llvm/lib/Analysis/TargetTransformInfo.cpp | 5 ++
.../SelectionDAG/SelectionDAGBuilder.cpp | 9 +++
.../Target/AArch64/AArch64ISelLowering.cpp | 46 +++++++++++++++
.../AArch64/AArch64TargetTransformInfo.cpp | 12 ++++
.../AArch64/AArch64TargetTransformInfo.h | 2 +
.../AArch64/intrinsic-vector-match-sve2.ll | 57 +++++++++++++++++++
10 files changed, 197 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index b17e3c828ed3d5..dd9851d1af0783 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19637,6 +19637,51 @@ are undefined.
}
+'``llvm.experimental.vector.match.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. Support for specific vector types is target
+dependent.
+
+::
+
+ declare <<n> x i1> @llvm.experimental.vector.match(<<n> x <ty>> %op1, <<n> x <ty>> %op2, <<n> x i1> %mask, i32 <segsize>)
+ declare <vscale x <n> x i1> @llvm.experimental.vector.match(<vscale x <n> x <ty>> %op1, <vscale x <n> x <ty>> %op2, <vscale x <n> x i1> %mask, i32 <segsize>)
+
+Overview:
+"""""""""
+
+Find elements of the first argument matching any elements of the second.
+
+Arguments:
+""""""""""
+
+The first argument is the search vector, the second argument is the vector of
+elements we are searching for (i.e. for which we consider a match successful),
+and the third argument is a mask that controls which elements of the first
+argument are active. The fourth argument is an immediate that sets the segment
+size for the search window.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.vector.match``' intrinsic compares each element in the
+first argument against potentially several elements of the second, placing
+``1`` in the corresponding element of the output vector if any comparison is
+successful, and ``0`` otherwise. Inactive elements in the mask are set to ``0``
+in the output. The segment size controls the number of elements of the second
+argument that are compared against.
+
+For example, for vectors with 16 elements, if ``segsize = 16`` then each
+element of the first argument is compared against all 16 elements of the second
+argument; but if ``segsize = 4``, then each of the first four elements of the
+first argument is compared against the first four elements of the second
+argument, each of the second four elements of the first argument is compared
+against the second four elements of the second argument, and so forth.
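+
+As a concrete illustration, given ``<8 x i8>`` operands
+``%op1 = <1, 2, 3, 4, 5, 6, 7, 8>`` and ``%op2 = <4, 3, 2, 1, 0, 0, 0, 0>``,
+an all-active mask, and ``segsize = 4``, the first four elements of ``%op1``
+are each compared against ``<4, 3, 2, 1>`` (all match) and the last four
+against ``<0, 0, 0, 0>`` (no matches), so the result is
+``<1, 1, 1, 1, 0, 0, 0, 0>``.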
+
Matrix Intrinsics
-----------------
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 38e8b9da213974..786c13a177ccf5 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1746,6 +1746,10 @@ class TargetTransformInfo {
bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
Align Alignment) const;
+ /// \returns True if the target supports vector match operations for the
+ /// vector type `VT` using a segment size of `SegSize`.
+ bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;
+
struct VPLegalization {
enum VPTransform {
// keep the predicating parameter
@@ -2184,6 +2188,7 @@ class TargetTransformInfo::Concept {
virtual bool supportsScalableVectors() const = 0;
virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
Align Alignment) const = 0;
+ virtual bool hasVectorMatch(VectorType *VT, unsigned SegSize) const = 0;
virtual VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
virtual bool hasArmWideBranch(bool Thumb) const = 0;
@@ -2952,6 +2957,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.hasActiveVectorLength(Opcode, DataType, Alignment);
}
+ bool hasVectorMatch(VectorType *VT, unsigned SegSize) const override {
+ return Impl.hasVectorMatch(VT, SegSize);
+ }
+
VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const override {
return Impl.getVPLegalizationStrategy(PI);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index d208a710bb27fd..36621861ab8c80 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -958,6 +958,8 @@ class TargetTransformInfoImplBase {
return false;
}
+ bool hasVectorMatch(VectorType *VT, unsigned SegSize) const { return false; }
+
TargetTransformInfo::VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const {
return TargetTransformInfo::VPLegalization(
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index b4e758136b39fb..f6d77aa596f601 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1892,6 +1892,16 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
[ IntrArgMemOnly ]>;
+// Experimental match
+def int_experimental_vector_match : DefaultAttrsIntrinsic<
+ [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ],
+ [ llvm_anyvector_ty,
+ LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, // Mask
+ llvm_i32_ty ], // Segment size
+ [ IntrNoMem, IntrNoSync, IntrWillReturn,
+ ImmArg<ArgIndex<3>> ]>;
+
// Operators
let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
// Integer arithmetic
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index dcde78925bfa98..d8314af0537fe5 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1352,6 +1352,11 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
}
+bool TargetTransformInfo::hasVectorMatch(VectorType *VT,
+ unsigned SegSize) const {
+ return TTIImpl->hasVectorMatch(VT, SegSize);
+}
+
TargetTransformInfo::Concept::~Concept() = default;
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 9d617c7acd13c2..9cb7d65975b9f4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8096,6 +8096,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ResultVT, Vec, Index));
return;
}
+ case Intrinsic::experimental_vector_match: {
+ auto *VT = dyn_cast<VectorType>(I.getOperand(0)->getType());
+ auto SegmentSize = cast<ConstantInt>(I.getOperand(3))->getLimitedValue();
+ const auto &TTI =
+ TLI.getTargetMachine().getTargetTransformInfo(*I.getFunction());
+ assert(VT && TTI.hasVectorMatch(VT, SegmentSize) && "Unsupported type!");
+ visitTargetIntrinsic(I, Intrinsic);
+ return;
+ }
case Intrinsic::vector_reverse:
visitVectorReverse(I);
return;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7704321a0fc3ac..050807142fc0ac 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6106,6 +6106,51 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
}
+ case Intrinsic::experimental_vector_match: {
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_match, dl, MVT::i64);
+
+ auto Op1 = Op.getOperand(1);
+ auto Op2 = Op.getOperand(2);
+ auto Mask = Op.getOperand(3);
+ auto SegmentSize =
+ cast<ConstantSDNode>(Op.getOperand(4))->getLimitedValue();
+
+ EVT VT = Op.getValueType();
+ auto MinNumElts = VT.getVectorMinNumElements();
+
+ assert(Op1.getValueType() == Op2.getValueType() && "Type mismatch.");
+ assert(Op1.getValueSizeInBits().getKnownMinValue() == 128 &&
+ "Custom lower only works on 128-bit segments.");
+ assert((Op1.getValueType().getVectorElementType() == MVT::i8 ||
+ Op1.getValueType().getVectorElementType() == MVT::i16) &&
+ "Custom lower only supports 8-bit or 16-bit characters.");
+ assert(SegmentSize == MinNumElts && "Custom lower needs segment size to "
+ "match minimum number of elements.");
+
+ if (VT.isScalableVector())
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Mask, Op1, Op2);
+
+ // We can use the SVE2 match instruction to lower this intrinsic by
+ // converting the operands to scalable vectors, doing a match, and then
+ // extracting a fixed-width subvector from the scalable vector.
+
+ EVT OpVT = Op1.getValueType();
+ EVT OpContainerVT = getContainerForFixedLengthVector(DAG, OpVT);
+ EVT MatchVT = OpContainerVT.changeElementType(MVT::i1);
+
+ auto ScalableOp1 = convertToScalableVector(DAG, OpContainerVT, Op1);
+ auto ScalableOp2 = convertToScalableVector(DAG, OpContainerVT, Op2);
+ auto ScalableMask = DAG.getNode(ISD::SIGN_EXTEND, dl, OpVT, Mask);
+ ScalableMask = convertFixedMaskToScalableVector(ScalableMask, DAG);
+
+ SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MatchVT, ID,
+ ScalableMask, ScalableOp1, ScalableOp2);
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT,
+ DAG.getNode(ISD::SIGN_EXTEND, dl, OpContainerVT, Match),
+ DAG.getVectorIdxConstant(0, dl));
+ }
}
}
@@ -26544,6 +26589,7 @@ void AArch64TargetLowering::ReplaceNodeResults(
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
+ case Intrinsic::experimental_vector_match:
case Intrinsic::get_active_lane_mask: {
if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
return;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b8f19fa87e2aba..806dc856c58626 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3835,6 +3835,18 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
}
}
+bool AArch64TTIImpl::hasVectorMatch(VectorType *VT, unsigned SegSize) const {
+ // Check that the target has SVE2 (and SVE is available), that `VT' is a
+ // legal type for MATCH, and that the segment spans the full 128-bit vector.
+ if (ST->hasSVE2() && ST->isSVEAvailable() &&
+ VT->getPrimitiveSizeInBits().getKnownMinValue() == 128 &&
+ VT->getElementCount().getKnownMinValue() == SegSize &&
+ (VT->getElementCount().getKnownMinValue() == 8 ||
+ VT->getElementCount().getKnownMinValue() == 16))
+ return true;
+ return false;
+}
+
InstructionCost
AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
FastMathFlags FMF,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a9189fd53f40bb..6ad21a9e0a77a9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -391,6 +391,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return ST->hasSVE();
}
+ bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;
+
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
std::optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind);
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll b/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll
new file mode 100644
index 00000000000000..0df92dfa800006
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
+
+define <vscale x 16 x i1> @match_nxv16i8(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: ret
+ %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %op2, <vscale x 16 x i1> %mask, i32 16)
+ ret <vscale x 16 x i1> %r
+}
+
+define <vscale x 8 x i1> @match_nxv8i16(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %op2, <vscale x 8 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: match p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: ret
+ %r = tail call <vscale x 8 x i1> @llvm.experimental.vector.match(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %op2, <vscale x 8 x i1> %mask, i32 8)
+ ret <vscale x 8 x i1> %r
+}
+
+define <16 x i1> @match_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) #0 {
+; CHECK-LABEL: match_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v2.16b, v2.16b, #7
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: cmlt v2.16b, v2.16b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0
+; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask, i32 16)
+ ret <16 x i1> %r
+}
+
+define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #0 {
+; CHECK-LABEL: match_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: shl v2.8h, v2.8h, #15
+; CHECK-NEXT: cmlt v2.8h, v2.8h, #0
+; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0
+; CHECK-NEXT: match p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: ret
+ %r = tail call <8 x i1> @llvm.experimental.vector.match(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask, i32 8)
+ ret <8 x i1> %r
+}
+
+attributes #0 = { "target-features"="+sve2" }
From a6e26ffeb9770df51b0338fb151d9b314b192343 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Mon, 15 Jul 2024 17:57:30 +0100
Subject: [PATCH 2/5] [AArch64] Add MATCH loops to LoopIdiomVectorizePass
This patch adds a new loop idiom to LoopIdiomVectorizePass, enabling it to
recognise and vectorise loops such as:
template<class InputIt, class ForwardIt>
InputIt find_first_of(InputIt first, InputIt last,
ForwardIt s_first, ForwardIt s_last)
{
for (; first != last; ++first)
for (ForwardIt it = s_first; it != s_last; ++it)
if (*first == *it)
return first;
return last;
}
These loops match the C++ standard library's std::find_first_of.
The loops are vectorised using @llvm.experimental.vector.match, which is
added separately.
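
At a high level, the vectorised loop loads a segment of the first range and,
for each vector's worth of elements from the second range, performs the match
roughly as follows (IR names illustrative):

  %m = call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i1> %pred, i32 16)
  %any = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %m)

On success, @llvm.experimental.cttz.elts is used on the match mask to recover
the index of the first matching element.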
---
.../Vectorize/LoopIdiomVectorize.cpp | 442 +++++++++++++++++-
llvm/test/CodeGen/AArch64/find-first-byte.ll | 120 +++++
2 files changed, 561 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AArch64/find-first-byte.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index cb31e2a2ecaec4..a9683f08c5ab9d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -79,6 +79,12 @@ static cl::opt<unsigned>
cl::desc("The vectorization factor for byte-compare patterns."),
cl::init(16));
+static cl::opt<bool>
+ DisableFindFirstByte("disable-loop-idiom-vectorize-find-first-byte",
+ cl::Hidden, cl::init(false),
+ cl::desc("Proceed with Loop Idiom Vectorize Pass, but "
+ "do not convert find-first-byte loop(s)."));
+
static cl::opt<bool>
VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false),
cl::desc("Verify loops generated Loop Idiom Vectorize Pass."));
@@ -136,6 +142,21 @@ class LoopIdiomVectorize {
PHINode *IndPhi, Value *MaxLen, Instruction *Index,
Value *Start, bool IncIdx, BasicBlock *FoundBB,
BasicBlock *EndBB);
+
+ bool recognizeFindFirstByte();
+
+ Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU,
+ unsigned VF, unsigned CharWidth,
+ BasicBlock *ExitSucc, BasicBlock *ExitFail,
+ GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+ Value *StartA, Value *EndA,
+ Value *StartB, Value *EndB);
+
+ void transformFindFirstByte(PHINode *IndPhi, unsigned VF, unsigned CharWidth,
+ BasicBlock *ExitSucc, BasicBlock *ExitFail,
+ GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+ Value *StartA, Value *EndA,
+ Value *StartB, Value *EndB);
/// @}
};
} // anonymous namespace
@@ -190,7 +211,13 @@ bool LoopIdiomVectorize::run(Loop *L) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %"
<< CurLoop->getHeader()->getName() << "\n");
- return recognizeByteCompare();
+ if (recognizeByteCompare())
+ return true;
+
+ if (recognizeFindFirstByte())
+ return true;
+
+ return false;
}
bool LoopIdiomVectorize::recognizeByteCompare() {
@@ -941,3 +968,416 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
report_fatal_error("Loops must remain in LCSSA form!");
}
}
+
+bool LoopIdiomVectorize::recognizeFindFirstByte() {
+ // Currently the transformation only works on scalable vector types, although
+ // there is no fundamental reason why it cannot be made to work for fixed
+ // width too.
+ if (!TTI->supportsScalableVectors() || DisableFindFirstByte)
+ return false;
+
+ // Define some constants we need throughout.
+ // TODO: Some of these could be made configurable parameters. For example, we
+ // could allow CharWidth = 16 (and VF = 8).
+ unsigned VF = 16;
+ unsigned CharWidth = 8;
+ BasicBlock *Header = CurLoop->getHeader();
+ LLVMContext &Ctx = Header->getContext();
+ auto *CharTy = Type::getIntNTy(Ctx, CharWidth);
+ auto *CharVTy = ScalableVectorType::get(CharTy, VF);
+
+ // Check if the target supports efficient vector matches for vectors of
+ // bytes.
+ if (!TTI->hasVectorMatch(CharVTy, VF))
+ return false;
+
+ // In LoopIdiomVectorize::run we have already checked that the loop has a
+ // preheader so we can assume it's in a canonical form.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 4)
+ return false;
+
+ // We expect this loop to have one nested loop.
+ if (CurLoop->getSubLoops().size() != 1)
+ return false;
+
+ auto *InnerLoop = CurLoop->getSubLoops().front();
+ PHINode *IndPhi = dyn_cast<PHINode>(&Header->front());
+
+ if (!IndPhi || IndPhi->getNumIncomingValues() != 2)
+ return false;
+
+ auto LoopBlocks = CurLoop->getBlocks();
+ // We are expecting the following blocks below. For now, we will bail out for
+ // anything deviating from this.
+ //
+ // .preheader: ; preds = %.preheader.preheader, %23
+ // %14 = phi ptr [ %24, %23 ], [ %3, %.preheader.preheader ]
+ // %15 = load i8, ptr %14, align 1, !tbaa !14
+ // br label %19
+ //
+ // 19: ; preds = %16, %.preheader
+ // %20 = phi ptr [ %7, %.preheader ], [ %17, %16 ]
+ // %21 = load i8, ptr %20, align 1, !tbaa !14
+ // %22 = icmp eq i8 %15, %21
+ // br i1 %22, label %.loopexit.loopexit, label %16
+ //
+ // 16: ; preds = %19
+ // %17 = getelementptr inbounds i8, ptr %20, i64 1
+ // %18 = icmp eq ptr %17, %10
+ // br i1 %18, label %23, label %19, !llvm.loop !15
+ //
+ // 23: ; preds = %16
+ // %24 = getelementptr inbounds i8, ptr %14, i64 1
+ // %25 = icmp eq ptr %24, %6
+ // br i1 %25, label %.loopexit.loopexit5, label %.preheader, !llvm.loop !17
+ //
+ if (LoopBlocks[0]->sizeWithoutDebug() > 3 ||
+ LoopBlocks[1]->sizeWithoutDebug() > 4 ||
+ LoopBlocks[2]->sizeWithoutDebug() > 3 ||
+ LoopBlocks[3]->sizeWithoutDebug() > 3)
+ return false;
+
+ // If we match the pattern, IndPhi is going to be replaced. We cannot replace
+ // the loop if any of its other instructions are used outside of it.
+ for (BasicBlock *BB : LoopBlocks)
+ for (Instruction &I : *BB)
+ if (&I != IndPhi)
+ for (User *U : I.users())
+ if (!CurLoop->contains(cast<Instruction>(U)))
+ return false;
+
+ // Match the branch instruction for the header. We are expecting an
+ // unconditional branch to the inner loop.
+ BasicBlock *MatchBB;
+ if (!match(Header->getTerminator(), m_UnconditionalBr(MatchBB)) ||
+ !InnerLoop->contains(MatchBB))
+ return false;
+
+ // MatchBB should be the entrypoint into the inner loop containing the
+ // comparison between a search item and a valid/successful match.
+ ICmpInst::Predicate MatchPred;
+ BasicBlock *ExitSucc;
+ BasicBlock *InnerBB;
+ Value *LoadA, *LoadB;
+ if (!match(MatchBB->getTerminator(),
+ m_Br(m_ICmp(MatchPred, m_Value(LoadA), m_Value(LoadB)),
+ m_BasicBlock(ExitSucc), m_BasicBlock(InnerBB))) ||
+ MatchPred != ICmpInst::Predicate::ICMP_EQ ||
+ !InnerLoop->contains(InnerBB))
+ return false;
+
+ // We expect a single use of IndPhi outside of CurLoop. The outside use
+ // should be a PHINode in ExitSucc coming from MatchBB.
+ // Note: Strictly speaking we are not checking for a *single* use of IndPhi
+ // outside of CurLoop here, but below we check that we only exit CurLoop to
+ // ExitSucc in one place, so by construction this should be true. Besides, in
+ // the event it is not, as long as the use is a PHINode in ExitSucc and comes
+ // from MatchBB, the transformation should still be valid in any case.
+ for (Use &U : IndPhi->uses())
+ if (CurLoop->contains(cast<Instruction>(U.getUser())))
+ continue;
+ else if (auto *PN = dyn_cast<PHINode>(U.getUser());
+ !PN || PN->getParent() != ExitSucc ||
+ PN->getIncomingBlock(U) != MatchBB)
+ return false;
+
+ // Match the loads.
+ Value *A, *B;
+ if (!match(LoadA, m_Load(m_Value(A))) || !match(LoadB, m_Load(m_Value(B))))
+ return false;
+
+ // Make sure they are simple.
+ LoadInst *LoadAI = cast<LoadInst>(LoadA);
+ LoadInst *LoadBI = cast<LoadInst>(LoadB);
+ if (!LoadAI->isSimple() || !LoadBI->isSimple())
+ return false;
+
+ // The values loaded come from two PHIs that can only have two incoming
+ // values.
+ PHINode *PNA = dyn_cast<PHINode>(A);
+ PHINode *PNB = dyn_cast<PHINode>(B);
+ if (!PNA || PNA->getNumIncomingValues() != 2 ||
+ !PNB || PNB->getNumIncomingValues() != 2)
+ return false;
+
+ // One PHI comes from the outer loop, the other one from the inner loop.
+ // CurLoop contains PNA, InnerLoop PNB.
+ if (InnerLoop->contains(PNA))
+ std::swap(PNA, PNB);
+ if (PNA != &Header->front() || PNB != &MatchBB->front())
+ return false;
+
+ // The incoming values of both PHI nodes should be a gep of 1.
+ Value *StartA = PNA->getIncomingValue(0);
+ Value *IndexA = PNA->getIncomingValue(1);
+ if (CurLoop->contains(PNA->getIncomingBlock(0)))
+ std::swap(StartA, IndexA);
+
+ Value *StartB = PNB->getIncomingValue(0);
+ Value *IndexB = PNB->getIncomingValue(1);
+ if (InnerLoop->contains(PNB->getIncomingBlock(0)))
+ std::swap(StartB, IndexB);
+
+ // Match the GEPs.
+ if (!match(IndexA, m_GEP(m_Specific(PNA), m_One())) ||
+ !match(IndexB, m_GEP(m_Specific(PNB), m_One())))
+ return false;
+
+ GetElementPtrInst *GEPA = cast<GetElementPtrInst>(IndexA);
+ GetElementPtrInst *GEPB = cast<GetElementPtrInst>(IndexB);
+
+ // Check we are loading CharTy values.
+ if (!GEPA->getResultElementType()->isIntegerTy(CharWidth) ||
+ !GEPB->getResultElementType()->isIntegerTy(CharWidth) ||
+ !LoadAI->getType()->isIntegerTy(CharWidth) ||
+ !LoadBI->getType()->isIntegerTy(CharWidth))
+ return false;
+
+ // InnerBB should increment the address of the key we are checking.
+ BasicBlock *OuterBB;
+ Value *EndB;
+ if (!match(InnerBB->getTerminator(),
+ m_Br(m_ICmp(MatchPred, m_Specific(GEPB), m_Value(EndB)),
+ m_BasicBlock(OuterBB), m_Specific(MatchBB))) ||
+ MatchPred != ICmpInst::Predicate::ICMP_EQ ||
+ !CurLoop->contains(OuterBB))
+ return false;
+
+ // OuterBB should increment the address of the element we are looking for.
+ Value *EndA;
+ BasicBlock *ExitFail;
+ if (!match(OuterBB->getTerminator(),
+ m_Br(m_ICmp(MatchPred, m_Specific(GEPA), m_Value(EndA)),
+ m_BasicBlock(ExitFail), m_Specific(Header))) ||
+ MatchPred != ICmpInst::Predicate::ICMP_EQ)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n"
+ << *CurLoop << "\n\n");
+
+ transformFindFirstByte(IndPhi, VF, CharWidth, ExitSucc, ExitFail, GEPA, GEPB,
+ StartA, EndA, StartB, EndB);
+ return true;
+}
+
+Value *LoopIdiomVectorize::expandFindFirstByte(
+ IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, unsigned CharWidth,
+ BasicBlock *ExitSucc, BasicBlock *ExitFail,
+ GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+ Value *StartA, Value *EndA, Value *StartB, Value *EndB) {
+ // Set up some types and constants that we intend to reuse.
+ auto *I64Ty = Builder.getInt64Ty();
+ auto *I32Ty = Builder.getInt32Ty();
+ auto *PtrTy = Builder.getPtrTy();
+ auto *CharTy = Builder.getIntNTy(CharWidth);
+ auto *PredVTy = ScalableVectorType::get(Builder.getInt1Ty(), VF);
+ auto *CharVTy = ScalableVectorType::get(CharTy, VF);
+
+ // Other common arguments.
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+ LLVMContext &Ctx = PHBranch->getContext();
+ Value *Passthru = ConstantInt::getNullValue(CharVTy);
+
+ // Split block in the original loop preheader.
+ BasicBlock *OldPH = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "oldph");
+
+ // Create the blocks that we're going to need. We separate them among outer
+ // (OL) and inner (IL) loops with functions similar to those in the original
+ // loops.
+ // 1. Check that we have at least one element to load. (OL)
+ // 2. Set up masks and load a vector of elements. (OL)
+ // 3. Check that we have at least one key to match against. (IL)
+ // 4. Check whether we can load a full register of keys. (IL)
+ // 5. If so, load it. (IL)
+ // 6. If not, set up a new mask, load the remaining keys, and splat the
+ // first one to the remainder of the register. (IL)
+ // 7. Carry out the match test; if successful go to (8), otherwise loop
+ // back to (3). (IL)
+ // 8. Figure out the index of the match.
+ // Note that only block (8) is *not* part of a loop (inner or outer).
+
+ BasicBlock *BB1 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+ BasicBlock *BB2 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+ BasicBlock *BB3 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+ BasicBlock *BB4 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+ BasicBlock *BB5 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+ BasicBlock *BB6 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+ BasicBlock *BB7 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+ BasicBlock *BB8 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+
+ // Update LoopInfo with the new loops.
+ auto OuterLoop = LI->AllocateLoop();
+ auto InnerLoop = LI->AllocateLoop();
+
+ if (CurLoop->getParentLoop()) {
+ CurLoop->getParentLoop()->addBasicBlockToLoop(BB8, *LI);
+ CurLoop->getParentLoop()->addChildLoop(OuterLoop);
+ } else {
+ LI->addTopLevelLoop(OuterLoop);
+ }
+
+ // Add the inner loop to the outer.
+ OuterLoop->addChildLoop(InnerLoop);
+
+ // Add the new basic blocks to the corresponding loops.
+ OuterLoop->addBasicBlockToLoop(BB1, *LI);
+ OuterLoop->addBasicBlockToLoop(BB2, *LI);
+ InnerLoop->addBasicBlockToLoop(BB3, *LI);
+ InnerLoop->addBasicBlockToLoop(BB4, *LI);
+ InnerLoop->addBasicBlockToLoop(BB5, *LI);
+ InnerLoop->addBasicBlockToLoop(BB6, *LI);
+ InnerLoop->addBasicBlockToLoop(BB7, *LI);
+
+ // Update the terminator added by SplitBlock to branch to the first block
+ Preheader->getTerminator()->setSuccessor(0, BB1);
+ DTU.applyUpdates({{DominatorTree::Insert, Preheader, BB1},
+ {DominatorTree::Delete, Preheader, OldPH}});
+
+ // (1) Check the outer loop iteration.
+ Builder.SetInsertPoint(BB1);
+ PHINode *PNA = Builder.CreatePHI(PtrTy, 2, "pna");
+ Value *CheckA = Builder.CreateICmpULT(PNA, EndA);
+ Builder.CreateCondBr(CheckA, BB2, ExitFail);
+ DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2},
+ {DominatorTree::Insert, BB1, ExitFail}});
+
+ // (2) Outer loop body.
+ Builder.SetInsertPoint(BB2);
+ Value *IncA = Builder.CreateGEP(CharTy, PNA, ConstantInt::get(I64Ty, VF), "",
+ GEPA->isInBounds());
+ Value *CheckIncA = Builder.CreateICmpUGT(IncA, EndA);
+ Value *SelA = Builder.CreateSelect(CheckIncA, EndA, IncA);
+ Value *PredA = Builder.CreateIntrinsic(
+ Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+ {Builder.CreatePointerCast(PNA, I64Ty),
+ Builder.CreatePointerCast(SelA, I64Ty)});
+ Value *LoadA =
+ Builder.CreateMaskedLoad(CharVTy, PNA, Align(1), PredA, Passthru);
+ Value *PredBInit = Builder.CreateIntrinsic(
+ Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+ {ConstantInt::get(I64Ty, 0), ConstantInt::get(I64Ty, VF)});
+ Builder.CreateBr(BB3);
+ DTU.applyUpdates({{DominatorTree::Insert, BB2, BB3}});
+
+ // (3) Check the inner loop iteration.
+ Builder.SetInsertPoint(BB3);
+ PHINode *PNB = Builder.CreatePHI(PtrTy, 2, "pnb");
+ PHINode *PredBFull = Builder.CreatePHI(PredVTy, 2);
+ Value *CheckB = Builder.CreateICmpULT(PNB, EndB);
+ Builder.CreateCondBr(CheckB, BB4, BB1);
+ DTU.applyUpdates({{DominatorTree::Insert, BB3, BB4},
+ {DominatorTree::Insert, BB3, BB1}});
+
+ // (4) Check load B.
+ Builder.SetInsertPoint(BB4);
+ Value *IncB = Builder.CreateGEP(CharTy, PNB, ConstantInt::get(I64Ty, VF), "",
+ GEPB->isInBounds());
+ Value *IfNotFullB = Builder.CreateICmpUGT(IncB, EndB);
+ Builder.CreateCondBr(IfNotFullB, BB6, BB5);
+ DTU.applyUpdates({{DominatorTree::Insert, BB4, BB6},
+ {DominatorTree::Insert, BB4, BB5}});
+
+ // (5) Full load B.
+ Builder.SetInsertPoint(BB5);
+ Value *LoadBFull =
+ Builder.CreateMaskedLoad(CharVTy, PNB, Align(1), PredBFull, Passthru);
+ Builder.CreateBr(BB7);
+ DTU.applyUpdates({{DominatorTree::Insert, BB5, BB7}});
+
+ // (6) Partial load B.
+ Builder.SetInsertPoint(BB6);
+ Value *PredBPart = Builder.CreateIntrinsic(
+ Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+ {Builder.CreatePointerCast(PNB, I64Ty),
+ Builder.CreatePointerCast(EndB, I64Ty)});
+ Value *LoadBPart =
+ Builder.CreateMaskedLoad(CharVTy, PNB, Align(1), PredBPart, Passthru);
+ Value *LoadB0 = Builder.CreateExtractElement(LoadBPart, uint64_t(0));
+ Value *LoadBSplat =
+ Builder.CreateVectorSplat(PredVTy->getElementCount(), LoadB0);
+ LoadBPart = Builder.CreateSelect(PredBPart, LoadBPart, LoadBSplat);
+ Builder.CreateBr(BB7);
+ DTU.applyUpdates({{DominatorTree::Insert, BB6, BB7}});
+
+ // (7) Carry out match.
+ Builder.SetInsertPoint(BB7);
+ PHINode *PredBNext = Builder.CreatePHI(PredVTy, 2);
+ PHINode *LoadB = Builder.CreatePHI(CharVTy, 2);
+ Value *MatchPred = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_match, {CharVTy},
+ {LoadA, LoadB, PredA, ConstantInt::get(I32Ty, VF)});
+ Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred);
+ Builder.CreateCondBr(IfAnyMatch, BB8, BB3);
+ DTU.applyUpdates({{DominatorTree::Insert, BB7, BB8},
+ {DominatorTree::Insert, BB7, BB3}});
+
+ // (8) Match success.
+ Builder.SetInsertPoint(BB8);
+ Value *MatchCnt = Builder.CreateIntrinsic(
+ Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()},
+ {MatchPred, /*ZeroIsPoison=*/Builder.getInt1(true)});
+ Value *MatchVal = Builder.CreateGEP(CharTy, PNA, MatchCnt);
+ Builder.CreateBr(ExitSucc);
+ DTU.applyUpdates({{DominatorTree::Insert, BB8, ExitSucc}});
+
+ // Set incoming values for PHIs.
+ PNA->addIncoming(StartA, Preheader);
+ PNA->addIncoming(IncA, BB3);
+
+ PNB->addIncoming(StartB, BB2);
+ PNB->addIncoming(IncB, BB7);
+ PredBFull->addIncoming(PredBInit, BB2);
+ PredBFull->addIncoming(PredBNext, BB7);
+
+ PredBNext->addIncoming(PredBFull, BB5);
+ PredBNext->addIncoming(PredBPart, BB6);
+ LoadB->addIncoming(LoadBFull, BB5);
+ LoadB->addIncoming(LoadBPart, BB6);
+
+ if (VerifyLoops) {
+ OuterLoop->verifyLoop();
+ InnerLoop->verifyLoop();
+ if (!OuterLoop->isRecursivelyLCSSAForm(*DT, *LI))
+ report_fatal_error("Loops must remain in LCSSA form!");
+ if (!InnerLoop->isRecursivelyLCSSAForm(*DT, *LI))
+ report_fatal_error("Loops must remain in LCSSA form!");
+ }
+
+ assert(OldPH->hasNPredecessors(0) && "Expected old loop to be unreachable.");
+
+ return MatchVal;
+}
+
+void LoopIdiomVectorize::transformFindFirstByte(
+ PHINode *IndPhi, unsigned VF, unsigned CharWidth,
+ BasicBlock *ExitSucc, BasicBlock *ExitFail,
+ GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+ Value *StartA, Value *EndA, Value *StartB, Value *EndB) {
+ // Insert the find first byte code at the end of the preheader block.
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+ IRBuilder<> Builder(PHBranch);
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
+
+ Value *MatchVal =
+ expandFindFirstByte(Builder, DTU, VF, CharWidth, ExitSucc, ExitFail,
+ GEPA, GEPB, StartA, EndA, StartB, EndB);
+
+ assert(PHBranch->isUnconditional() &&
+ "Expected preheader to terminate with an unconditional branch.");
+
+ // Add new incoming values with the result of the transformation to the old
+ // uses of IndPhi in ExitSucc.
+ for (auto &PN : ExitSucc->phis())
+ for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
+ if (PN.getIncomingValue(i) == IndPhi)
+ PN.addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent());
+
+ // Maybe EliminateUnreachableBlocks? I've left the old blocks in place for
+ // now because we may want to reuse them to implement an alternative path
+ // for small arrays, for example.
+
+ //dbgs() << *Preheader->getParent() << "\n";
+}
diff --git a/llvm/test/CodeGen/AArch64/find-first-byte.ll b/llvm/test/CodeGen/AArch64/find-first-byte.ll
new file mode 100644
index 00000000000000..4bf3df1d4817b6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/find-first-byte.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mattr=+sve2 -mtriple=aarch64 -passes='loop(loop-idiom-vectorize),simplifycfg' -S < %s | FileCheck -check-prefix=SVE2 %s
+; RUN: opt -mattr=-sve2 -mtriple=aarch64 -passes='loop(loop-idiom-vectorize),simplifycfg' -S < %s | FileCheck -check-prefix=NOSVE2 %s
+
+define dso_local noundef ptr @first_byte_of(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+; SVE2-LABEL: define dso_local noundef ptr @first_byte_of(
+; SVE2-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
+; SVE2-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; SVE2-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; SVE2-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; SVE2-NEXT: br i1 [[TMP7]], [[DOTLOOPEXIT1:label %.*]], label %[[DOTPREHEADER:.*]]
+; SVE2: [[_PREHEADER:.*:]]
+; SVE2-NEXT: [[PNA:%.*]] = phi ptr [ [[TMP10:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], [[TMP4:%.*]] ]
+; SVE2-NEXT: [[TMP8:%.*]] = icmp ult ptr [[PNA]], [[TMP1]]
+; SVE2-NEXT: br i1 [[TMP8]], label %[[BB9:.*]], [[DOTLOOPEXIT1]]
+; SVE2: [[BB9]]:
+; SVE2-NEXT: [[TMP10]] = getelementptr inbounds i8, ptr [[PNA]], i64 16
+; SVE2-NEXT: [[TMP11:%.*]] = icmp ugt ptr [[TMP10]], [[TMP1]]
+; SVE2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], ptr [[TMP1]], ptr [[TMP10]]
+; SVE2-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[PNA]] to i64
+; SVE2-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP12]] to i64
+; SVE2-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP13]], i64 [[TMP14]])
+; SVE2-NEXT: [[TMP16:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNA]], i32 1, <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> zeroinitializer)
+; SVE2-NEXT: [[TMP17:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
+; SVE2-NEXT: br label %[[TMP18]]
+; SVE2: [[TMP18]]:
+; SVE2-NEXT: [[PNB:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP22:%.*]], %[[TMP33:.*]] ]
+; SVE2-NEXT: [[TMP19:%.*]] = phi <vscale x 16 x i1> [ [[TMP17]], %[[BB9]] ], [ [[TMP34:%.*]], %[[TMP33]] ]
+; SVE2-NEXT: [[TMP20:%.*]] = icmp ult ptr [[PNB]], [[TMP3]]
+; SVE2-NEXT: br i1 [[TMP20]], label %[[BB21:.*]], label %[[DOTPREHEADER]]
+; SVE2: [[BB21]]:
+; SVE2-NEXT: [[TMP22]] = getelementptr inbounds i8, ptr [[PNB]], i64 16
+; SVE2-NEXT: [[TMP23:%.*]] = icmp ugt ptr [[TMP22]], [[TMP3]]
+; SVE2-NEXT: br i1 [[TMP23]], label %[[BB26:.*]], label %[[BB24:.*]]
+; SVE2: [[BB24]]:
+; SVE2-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNB]], i32 1, <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> zeroinitializer)
+; SVE2-NEXT: br label %[[TMP33]]
+; SVE2: [[BB26]]:
+; SVE2-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[PNB]] to i64
+; SVE2-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; SVE2-NEXT: [[TMP29:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP27]], i64 [[TMP28]])
+; SVE2-NEXT: [[TMP30:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNB]], i32 1, <vscale x 16 x i1> [[TMP29]], <vscale x 16 x i8> zeroinitializer)
+; SVE2-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i8> [[TMP30]], i64 0
+; SVE2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP31]], i64 0
+; SVE2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; SVE2-NEXT: [[TMP32:%.*]] = select <vscale x 16 x i1> [[TMP29]], <vscale x 16 x i8> [[TMP30]], <vscale x 16 x i8> [[DOTSPLAT]]
+; SVE2-NEXT: br label %[[TMP33]]
+; SVE2: [[TMP33]]:
+; SVE2-NEXT: [[TMP34]] = phi <vscale x 16 x i1> [ [[TMP19]], %[[BB24]] ], [ [[TMP29]], %[[BB26]] ]
+; SVE2-NEXT: [[TMP35:%.*]] = phi <vscale x 16 x i8> [ [[TMP25]], %[[BB24]] ], [ [[TMP32]], %[[BB26]] ]
+; SVE2-NEXT: [[TMP36:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP35]], <vscale x 16 x i1> [[TMP15]], i32 16)
+; SVE2-NEXT: [[TMP37:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP36]])
+; SVE2-NEXT: br i1 [[TMP37]], label %[[DOTLOOPEXIT:.*]], label %[[TMP18]]
+; SVE2: [[_LOOPEXIT:.*:]]
+; SVE2-NEXT: [[TMP38:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP36]], i1 true)
+; SVE2-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[PNA]], i64 [[TMP38]]
+; SVE2-NEXT: br [[DOTLOOPEXIT1]]
+; SVE2: [[_LOOPEXIT1:.*:]]
+; SVE2-NEXT: [[TMP40:%.*]] = phi ptr [ [[TMP1]], [[TMP4]] ], [ [[TMP39]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTPREHEADER]] ]
+; SVE2-NEXT: ret ptr [[TMP40]]
+;
+; NOSVE2-LABEL: define dso_local noundef ptr @first_byte_of(
+; NOSVE2-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
+; NOSVE2-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; NOSVE2-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; NOSVE2-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; NOSVE2-NEXT: br i1 [[TMP7]], [[DOTLOOPEXIT:label %.*]], label %[[DOTPREHEADER:.*]]
+; NOSVE2: [[_LOOPEXIT:.*:]]
+; NOSVE2-NEXT: [[TMP8:%.*]] = phi ptr [ [[TMP18:%.*]], %[[TMP17:.*]] ], [ [[TMP0]], [[TMP4:%.*]] ]
+; NOSVE2-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1
+; NOSVE2-NEXT: br label %[[BB13:.*]]
+; NOSVE2: [[BB10:.*]]:
+; NOSVE2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP14:%.*]], i64 1
+; NOSVE2-NEXT: [[TMP12:%.*]] = icmp eq ptr [[TMP11]], [[TMP3]]
+; NOSVE2-NEXT: br i1 [[TMP12]], label %[[TMP17]], label %[[BB13]]
+; NOSVE2: [[BB13]]:
+; NOSVE2-NEXT: [[TMP14]] = phi ptr [ [[TMP2]], %[[DOTPREHEADER]] ], [ [[TMP11]], %[[BB10]] ]
+; NOSVE2-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP14]], align 1
+; NOSVE2-NEXT: [[TMP16:%.*]] = icmp eq i8 [[TMP9]], [[TMP15]]
+; NOSVE2-NEXT: br i1 [[TMP16]], [[DOTLOOPEXIT]], label %[[BB10]]
+; NOSVE2: [[TMP17]]:
+; NOSVE2-NEXT: [[TMP18]] = getelementptr inbounds i8, ptr [[TMP8]], i64 1
+; NOSVE2-NEXT: [[TMP19:%.*]] = icmp eq ptr [[TMP18]], [[TMP1]]
+; NOSVE2-NEXT: br i1 [[TMP19]], [[DOTLOOPEXIT]], label %[[DOTPREHEADER]]
+; NOSVE2: [[_LOOPEXIT2:.*:]]
+; NOSVE2-NEXT: [[TMP40:%.*]] = phi ptr [ [[TMP1]], [[TMP4]] ], [ [[TMP8]], %[[BB13]] ], [ [[TMP1]], %[[TMP17]] ]
+; NOSVE2-NEXT: ret ptr [[TMP40]]
+;
+ %5 = icmp eq ptr %0, %1
+ %6 = icmp eq ptr %2, %3
+ %7 = or i1 %5, %6
+ br i1 %7, label %21, label %8
+
+8:
+ %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
+ %10 = load i8, ptr %9, align 1
+ br label %14
+
+11:
+ %12 = getelementptr inbounds i8, ptr %15, i64 1
+ %13 = icmp eq ptr %12, %3
+ br i1 %13, label %18, label %14
+
+14:
+ %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
+ %16 = load i8, ptr %15, align 1
+ %17 = icmp eq i8 %10, %16
+ br i1 %17, label %21, label %11
+
+18:
+ %19 = getelementptr inbounds i8, ptr %9, i64 1
+ %20 = icmp eq ptr %19, %1
+ br i1 %20, label %21, label %8
+
+21:
+ %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
+ ret ptr %22
+}
+
+attributes #0 = { "target-features"="+sve2" }
From 40c35030d2b8385dfea66d0c36d39429e6a457d3 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Mon, 5 Aug 2024 14:43:54 +0100
Subject: [PATCH 3/5] Fix format
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 050807142fc0ac..c7e1015a714c8d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6122,7 +6122,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
assert(Op1.getValueType() == Op2.getValueType() && "Type mismatch.");
assert(Op1.getValueSizeInBits().getKnownMinValue() == 128 &&
"Custom lower only works on 128-bit segments.");
- assert((Op1.getValueType().getVectorElementType() == MVT::i8 ||
+ assert((Op1.getValueType().getVectorElementType() == MVT::i8 ||
Op1.getValueType().getVectorElementType() == MVT::i16) &&
"Custom lower only supports 8-bit or 16-bit characters.");
assert(SegmentSize == MinNumElts && "Custom lower needs segment size to "
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 806dc856c58626..949470678e58c3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3841,7 +3841,7 @@ bool AArch64TTIImpl::hasVectorMatch(VectorType *VT, unsigned SegSize) const {
if (ST->hasSVE2() && ST->isSVEAvailable() &&
VT->getPrimitiveSizeInBits().getKnownMinValue() == 128 &&
VT->getElementCount().getKnownMinValue() == SegSize &&
- (VT->getElementCount().getKnownMinValue() == 8 ||
+ (VT->getElementCount().getKnownMinValue() == 8 ||
VT->getElementCount().getKnownMinValue() == 16))
return true;
return false;
From 30b1ff59d2a85ff737b5be84d9670f55ca95d5a6 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Mon, 5 Aug 2024 14:56:46 +0100
Subject: [PATCH 4/5] Fix format
---
.../Vectorize/LoopIdiomVectorize.cpp | 71 +++++++++----------
1 file changed, 33 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index a9683f08c5ab9d..9195f95b84451f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -149,14 +149,14 @@ class LoopIdiomVectorize {
unsigned VF, unsigned CharWidth,
BasicBlock *ExitSucc, BasicBlock *ExitFail,
GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
- Value *StartA, Value *EndA,
- Value *StartB, Value *EndB);
+ Value *StartA, Value *EndA, Value *StartB,
+ Value *EndB);
void transformFindFirstByte(PHINode *IndPhi, unsigned VF, unsigned CharWidth,
BasicBlock *ExitSucc, BasicBlock *ExitFail,
GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
- Value *StartA, Value *EndA,
- Value *StartB, Value *EndB);
+ Value *StartA, Value *EndA, Value *StartB,
+ Value *EndB);
/// @}
};
} // anonymous namespace
@@ -1010,23 +1010,23 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
// We are expecting the following blocks below. For now, we will bail out for
// anything deviating from this.
//
- // .preheader: ; preds = %.preheader.preheader, %23
+ // .preheader:
// %14 = phi ptr [ %24, %23 ], [ %3, %.preheader.preheader ]
// %15 = load i8, ptr %14, align 1, !tbaa !14
// br label %19
//
- // 19: ; preds = %16, %.preheader
+ // 19:
// %20 = phi ptr [ %7, %.preheader ], [ %17, %16 ]
// %21 = load i8, ptr %20, align 1, !tbaa !14
// %22 = icmp eq i8 %15, %21
// br i1 %22, label %.loopexit.loopexit, label %16
//
- // 16: ; preds = %19
+ // 16:
// %17 = getelementptr inbounds i8, ptr %20, i64 1
// %18 = icmp eq ptr %17, %10
// br i1 %18, label %23, label %19, !llvm.loop !15
//
- // 23: ; preds = %16
+ // 23:
// %24 = getelementptr inbounds i8, ptr %14, i64 1
// %25 = icmp eq ptr %24, %6
// br i1 %25, label %.loopexit.loopexit5, label %.preheader, !llvm.loop !17
@@ -1096,8 +1096,8 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
// values.
PHINode *PNA = dyn_cast<PHINode>(A);
PHINode *PNB = dyn_cast<PHINode>(B);
- if (!PNA || PNA->getNumIncomingValues() != 2 ||
- !PNB || PNB->getNumIncomingValues() != 2)
+ if (!PNA || PNA->getNumIncomingValues() != 2 || !PNB ||
+ PNB->getNumIncomingValues() != 2)
return false;
// One PHI comes from the outer loop, the other one from the inner loop.
@@ -1139,8 +1139,7 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
if (!match(InnerBB->getTerminator(),
m_Br(m_ICmp(MatchPred, m_Specific(GEPB), m_Value(EndB)),
m_BasicBlock(OuterBB), m_Specific(MatchBB))) ||
- MatchPred != ICmpInst::Predicate::ICMP_EQ ||
- !CurLoop->contains(OuterBB))
+ MatchPred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(OuterBB))
return false;
// OuterBB should increment the address of the element we are looking for.
@@ -1152,8 +1151,7 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
MatchPred != ICmpInst::Predicate::ICMP_EQ)
return false;
- LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n"
- << *CurLoop << "\n\n");
+ LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n" << *CurLoop << "\n\n");
transformFindFirstByte(IndPhi, VF, CharWidth, ExitSucc, ExitFail, GEPA, GEPB,
StartA, EndA, StartB, EndB);
@@ -1162,9 +1160,9 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
Value *LoopIdiomVectorize::expandFindFirstByte(
IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, unsigned CharWidth,
- BasicBlock *ExitSucc, BasicBlock *ExitFail,
- GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
- Value *StartA, Value *EndA, Value *StartB, Value *EndB) {
+ BasicBlock *ExitSucc, BasicBlock *ExitFail, GetElementPtrInst *GEPA,
+ GetElementPtrInst *GEPB, Value *StartA, Value *EndA, Value *StartB,
+ Value *EndB) {
// Set up some types and constants that we intend to reuse.
auto *I64Ty = Builder.getInt64Ty();
auto *I32Ty = Builder.getInt32Ty();
@@ -1248,10 +1246,10 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
GEPA->isInBounds());
Value *CheckIncA = Builder.CreateICmpUGT(IncA, EndA);
Value *SelA = Builder.CreateSelect(CheckIncA, EndA, IncA);
- Value *PredA = Builder.CreateIntrinsic(
- Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
- {Builder.CreatePointerCast(PNA, I64Ty),
- Builder.CreatePointerCast(SelA, I64Ty)});
+ Value *PredA =
+ Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+ {Builder.CreatePointerCast(PNA, I64Ty),
+ Builder.CreatePointerCast(SelA, I64Ty)});
Value *LoadA =
Builder.CreateMaskedLoad(CharVTy, PNA, Align(1), PredA, Passthru);
Value *PredBInit = Builder.CreateIntrinsic(
@@ -1266,8 +1264,8 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
PHINode *PredBFull = Builder.CreatePHI(PredVTy, 2);
Value *CheckB = Builder.CreateICmpULT(PNB, EndB);
Builder.CreateCondBr(CheckB, BB4, BB1);
- DTU.applyUpdates({{DominatorTree::Insert, BB3, BB4},
- {DominatorTree::Insert, BB3, BB1}});
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, BB3, BB4}, {DominatorTree::Insert, BB3, BB1}});
// (4) Check load B.
Builder.SetInsertPoint(BB4);
@@ -1275,8 +1273,8 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
GEPB->isInBounds());
Value *IfNotFullB = Builder.CreateICmpUGT(IncB, EndB);
Builder.CreateCondBr(IfNotFullB, BB6, BB5);
- DTU.applyUpdates({{DominatorTree::Insert, BB4, BB6},
- {DominatorTree::Insert, BB4, BB5}});
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, BB4, BB6}, {DominatorTree::Insert, BB4, BB5}});
// (5) Full load B.
Builder.SetInsertPoint(BB5);
@@ -1287,10 +1285,10 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
// (6) Partial load B.
Builder.SetInsertPoint(BB6);
- Value *PredBPart = Builder.CreateIntrinsic(
- Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
- {Builder.CreatePointerCast(PNB, I64Ty),
- Builder.CreatePointerCast(EndB, I64Ty)});
+ Value *PredBPart =
+ Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+ {Builder.CreatePointerCast(PNB, I64Ty),
+ Builder.CreatePointerCast(EndB, I64Ty)});
Value *LoadBPart =
Builder.CreateMaskedLoad(CharVTy, PNB, Align(1), PredBPart, Passthru);
Value *LoadB0 = Builder.CreateExtractElement(LoadBPart, uint64_t(0));
@@ -1309,8 +1307,8 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
{LoadA, LoadB, PredA, ConstantInt::get(I32Ty, VF)});
Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred);
Builder.CreateCondBr(IfAnyMatch, BB8, BB3);
- DTU.applyUpdates({{DominatorTree::Insert, BB7, BB8},
- {DominatorTree::Insert, BB7, BB3}});
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, BB7, BB8}, {DominatorTree::Insert, BB7, BB3}});
// (8) Match success.
Builder.SetInsertPoint(BB8);
@@ -1350,9 +1348,8 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
}
void LoopIdiomVectorize::transformFindFirstByte(
- PHINode *IndPhi, unsigned VF, unsigned CharWidth,
- BasicBlock *ExitSucc, BasicBlock *ExitFail,
- GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+ PHINode *IndPhi, unsigned VF, unsigned CharWidth, BasicBlock *ExitSucc,
+ BasicBlock *ExitFail, GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
Value *StartA, Value *EndA, Value *StartB, Value *EndB) {
// Insert the find first byte code at the end of the preheader block.
BasicBlock *Preheader = CurLoop->getLoopPreheader();
@@ -1362,8 +1359,8 @@ void LoopIdiomVectorize::transformFindFirstByte(
Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
Value *MatchVal =
- expandFindFirstByte(Builder, DTU, VF, CharWidth, ExitSucc, ExitFail,
- GEPA, GEPB, StartA, EndA, StartB, EndB);
+ expandFindFirstByte(Builder, DTU, VF, CharWidth, ExitSucc, ExitFail, GEPA,
+ GEPB, StartA, EndA, StartB, EndB);
assert(PHBranch->isUnconditional() &&
"Expected preheader to terminate with an unconditional branch.");
@@ -1378,6 +1375,4 @@ void LoopIdiomVectorize::transformFindFirstByte(
// Maybe EliminateUnreachableBlocks? I've left the old blocks in place for
// now because we may want to reuse them to implement an alternative path
// for small arrays, for example.
-
- //dbgs() << *Preheader->getParent() << "\n";
}
From 8bd52cefd7361384b888853e647d37be436d8c5b Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Thu, 8 Aug 2024 17:54:10 +0100
Subject: [PATCH 5/5] [AArch64] Simplify find-first-byte
This simplifies the implementation by removing some of the alternative
code paths for specific sizes of the arguments.
---
.../Vectorize/LoopIdiomVectorize.cpp | 313 +++++++++---------
llvm/test/CodeGen/AArch64/find-first-byte.ll | 157 +++++----
2 files changed, 238 insertions(+), 232 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 9195f95b84451f..750260b904360d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -10,8 +10,10 @@
// transforms them into more optimized versions of the same loop. In cases
// where this happens, it can be a significant performance win.
//
-// We currently only recognize one loop that finds the first mismatched byte
-// in an array and returns the index, i.e. something like:
+// We currently support two loops:
+//
+// 1. A loop that finds the first mismatched byte in an array and returns the
+// index, i.e. something like:
//
// while (++i != n) {
// if (a[i] != b[i])
@@ -24,11 +26,27 @@
// boundaries. However, even with these checks it is still profitable to do the
// transformation.
//
+// 2. A loop that finds the first matching byte in an array among a set of
+// possible matches, e.g.:
+//
+// for (; first != last; ++first)
+// for (s_it = s_first; s_it != s_last; ++s_it)
+// if (*first == *s_it)
+// return first;
+// return last;
+//
+// This corresponds to std::find_first_of (for arrays of bytes) from the C++
+// standard library. This function can be implemented very efficiently for
+// targets that support @llvm.experimental.vector.match. For example, on
+// AArch64 targets that implement SVE2, this lowers to the MATCH instruction,
+// which enables us to perform 16x16=256 comparisons in one go. This can lead
+// to very significant speedups.
+//
//===----------------------------------------------------------------------===//
//
-// NOTE: This Pass matches a really specific loop pattern because it's only
+// NOTE: This Pass matches really specific loop patterns because it's only
// supposed to be a temporary solution until our LoopVectorizer is powerful
-// enought to vectorize it automatically.
+// enough to vectorize them automatically.
//
// TODO List:
//
@@ -148,13 +166,11 @@ class LoopIdiomVectorize {
Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU,
unsigned VF, unsigned CharWidth,
BasicBlock *ExitSucc, BasicBlock *ExitFail,
- GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
Value *StartA, Value *EndA, Value *StartB,
Value *EndB);
void transformFindFirstByte(PHINode *IndPhi, unsigned VF, unsigned CharWidth,
BasicBlock *ExitSucc, BasicBlock *ExitFail,
- GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
Value *StartA, Value *EndA, Value *StartB,
Value *EndB);
/// @}
@@ -1012,24 +1028,24 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
//
// .preheader:
// %14 = phi ptr [ %24, %23 ], [ %3, %.preheader.preheader ]
- // %15 = load i8, ptr %14, align 1, !tbaa !14
+ // %15 = load i8, ptr %14, align 1
// br label %19
//
// 19:
// %20 = phi ptr [ %7, %.preheader ], [ %17, %16 ]
- // %21 = load i8, ptr %20, align 1, !tbaa !14
+ // %21 = load i8, ptr %20, align 1
// %22 = icmp eq i8 %15, %21
// br i1 %22, label %.loopexit.loopexit, label %16
//
// 16:
// %17 = getelementptr inbounds i8, ptr %20, i64 1
// %18 = icmp eq ptr %17, %10
- // br i1 %18, label %23, label %19, !llvm.loop !15
+ // br i1 %18, label %23, label %19
//
// 23:
// %24 = getelementptr inbounds i8, ptr %14, i64 1
// %25 = icmp eq ptr %24, %6
- // br i1 %25, label %.loopexit.loopexit5, label %.preheader, !llvm.loop !17
+ // br i1 %25, label %.loopexit.loopexit5, label %.preheader
//
if (LoopBlocks[0]->sizeWithoutDebug() > 3 ||
LoopBlocks[1]->sizeWithoutDebug() > 4 ||
@@ -1153,16 +1169,15 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n" << *CurLoop << "\n\n");
- transformFindFirstByte(IndPhi, VF, CharWidth, ExitSucc, ExitFail, GEPA, GEPB,
- StartA, EndA, StartB, EndB);
+ transformFindFirstByte(IndPhi, VF, CharWidth, ExitSucc, ExitFail, StartA,
+ EndA, StartB, EndB);
return true;
}
Value *LoopIdiomVectorize::expandFindFirstByte(
IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, unsigned CharWidth,
- BasicBlock *ExitSucc, BasicBlock *ExitFail, GetElementPtrInst *GEPA,
- GetElementPtrInst *GEPB, Value *StartA, Value *EndA, Value *StartB,
- Value *EndB) {
+ BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *StartA, Value *EndA,
+ Value *StartB, Value *EndB) {
// Set up some types and constants that we intend to reuse.
auto *I64Ty = Builder.getInt64Ty();
auto *I32Ty = Builder.getInt32Ty();
@@ -1170,187 +1185,157 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
auto *CharTy = Builder.getIntNTy(CharWidth);
auto *PredVTy = ScalableVectorType::get(Builder.getInt1Ty(), VF);
auto *CharVTy = ScalableVectorType::get(CharTy, VF);
+ auto *ConstVF = ConstantInt::get(I32Ty, VF);
// Other common arguments.
BasicBlock *Preheader = CurLoop->getLoopPreheader();
- BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
- LLVMContext &Ctx = PHBranch->getContext();
+ LLVMContext &Ctx = Preheader->getContext();
Value *Passthru = ConstantInt::getNullValue(CharVTy);
// Split block in the original loop preheader.
- BasicBlock *OldPH = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "oldph");
-
- // Create the blocks that we're going to need. We separate them among outer
- // (OL) and inner (IL) loops with functions similar to those in the original
- // loops.
- // 1. Check that we have at least one element to load. (OL)
- // 2. Set up masks and load a vector of elements. (OL)
- // 3. Check that we have at least one key to match against. (IL)
- // 4. Check whether we can load a full register of keys. (IL)
- // 5. If so, load it. (IL)
- // 6. If not, set up a new mask, load the keys possible, and splat the
- // first one to the remainder of the register. (IL)
- // 7. Carry out the match test; if successful go to (8), otherwise loop
- // back to (3). (IL)
- // 8. Figure out the index of the match.
- // Note that only block (8) is *not* part of a loop (inner or outer).
-
- BasicBlock *BB1 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
- BasicBlock *BB2 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
- BasicBlock *BB3 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
- BasicBlock *BB4 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
- BasicBlock *BB5 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
- BasicBlock *BB6 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
- BasicBlock *BB7 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
- BasicBlock *BB8 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+ // SPH is the new preheader to the old scalar loop.
+ BasicBlock *SPH = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
+ nullptr, "scalar_ph");
+
+ // Create the blocks that we're going to use.
+ //
+ // We will have the following loops:
+ // (O) Outer loop where we iterate over the elements of the first array (A).
+ // (I) Inner loop where we iterate over the elements of the second array (B).
+ //
+ // Overall, the blocks created below will carry out the following actions:
+ // (1) Load a vector's worth of A. Go to (2).
+ // (2) (a) Load a vector's worth of B.
+ // (b) Splat the first element loaded to the inactive lanes.
+  //     (c) Check if any elements match. If so, go to (3); otherwise go
+  //         to (4).
+  // (3) Compute the index of the first match and exit.
+  // (4) Check if we've reached the end of B. If not, loop back to (2);
+  //     otherwise go to (5).
+  // (5) Check if we've reached the end of A. If not, loop back to (1);
+  //     otherwise exit.
+ // Block (3) is not part of any loop. Blocks (1,5) and (2,4) belong to the
+ // outer and inner loops, respectively.
+ BasicBlock *BB1 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+ BasicBlock *BB2 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+ BasicBlock *BB3 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+ BasicBlock *BB4 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+ BasicBlock *BB5 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
// Update LoopInfo with the new loops.
- auto OuterLoop = LI->AllocateLoop();
- auto InnerLoop = LI->AllocateLoop();
+ auto OL = LI->AllocateLoop();
+ auto IL = LI->AllocateLoop();
- if (CurLoop->getParentLoop()) {
- CurLoop->getParentLoop()->addBasicBlockToLoop(BB8, *LI);
- CurLoop->getParentLoop()->addChildLoop(OuterLoop);
+ if (auto ParentLoop = CurLoop->getParentLoop()) {
+ ParentLoop->addChildLoop(OL);
+ ParentLoop->addBasicBlockToLoop(BB3, *LI);
} else {
- LI->addTopLevelLoop(OuterLoop);
+ LI->addTopLevelLoop(OL);
}
// Add the inner loop to the outer.
- OuterLoop->addChildLoop(InnerLoop);
+ OL->addChildLoop(IL);
// Add the new basic blocks to the corresponding loops.
- OuterLoop->addBasicBlockToLoop(BB1, *LI);
- OuterLoop->addBasicBlockToLoop(BB2, *LI);
- InnerLoop->addBasicBlockToLoop(BB3, *LI);
- InnerLoop->addBasicBlockToLoop(BB4, *LI);
- InnerLoop->addBasicBlockToLoop(BB5, *LI);
- InnerLoop->addBasicBlockToLoop(BB6, *LI);
- InnerLoop->addBasicBlockToLoop(BB7, *LI);
-
- // Update the terminator added by SplitBlock to branch to the first block
- Preheader->getTerminator()->setSuccessor(0, BB1);
- DTU.applyUpdates({{DominatorTree::Insert, Preheader, BB1},
- {DominatorTree::Delete, Preheader, OldPH}});
-
- // (1) Check the outer loop iteration.
+ OL->addBasicBlockToLoop(BB1, *LI);
+ OL->addBasicBlockToLoop(BB5, *LI);
+ IL->addBasicBlockToLoop(BB2, *LI);
+ IL->addBasicBlockToLoop(BB4, *LI);
+
+  // Keep the old scalar loop reachable via a never-taken branch.
+ Builder.SetInsertPoint(Preheader->getTerminator());
+ Builder.CreateCondBr(Builder.getFalse(), SPH, BB1);
+ Preheader->getTerminator()->eraseFromParent();
+ DTU.applyUpdates({{DominatorTree::Insert, Preheader, BB1}});
+
+ // (1) Load a vector's worth of A and branch to the inner loop.
Builder.SetInsertPoint(BB1);
- PHINode *PNA = Builder.CreatePHI(PtrTy, 2, "pna");
- Value *CheckA = Builder.CreateICmpULT(PNA, EndA);
- Builder.CreateCondBr(CheckA, BB2, ExitFail);
- DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2},
- {DominatorTree::Insert, BB1, ExitFail}});
+ PHINode *PA = Builder.CreatePHI(PtrTy, 2, "pa");
- // (2) Outer loop body.
- Builder.SetInsertPoint(BB2);
- Value *IncA = Builder.CreateGEP(CharTy, PNA, ConstantInt::get(I64Ty, VF), "",
- GEPA->isInBounds());
- Value *CheckIncA = Builder.CreateICmpUGT(IncA, EndA);
- Value *SelA = Builder.CreateSelect(CheckIncA, EndA, IncA);
+ Value *IncA = Builder.CreateGEP(CharTy, PA, ConstVF);
+ Value *CheckA = Builder.CreateICmpULT(IncA, EndA);
+ Value *SelA = Builder.CreateSelect(CheckA, IncA, EndA);
Value *PredA =
Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
- {Builder.CreatePointerCast(PNA, I64Ty),
+ {Builder.CreatePointerCast(PA, I64Ty),
Builder.CreatePointerCast(SelA, I64Ty)});
Value *LoadA =
- Builder.CreateMaskedLoad(CharVTy, PNA, Align(1), PredA, Passthru);
- Value *PredBInit = Builder.CreateIntrinsic(
- Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
- {ConstantInt::get(I64Ty, 0), ConstantInt::get(I64Ty, VF)});
- Builder.CreateBr(BB3);
- DTU.applyUpdates({{DominatorTree::Insert, BB2, BB3}});
-
- // (3) Check the inner loop iteration.
- Builder.SetInsertPoint(BB3);
- PHINode *PNB = Builder.CreatePHI(PtrTy, 2, "pnb");
- PHINode *PredBFull = Builder.CreatePHI(PredVTy, 2);
- Value *CheckB = Builder.CreateICmpULT(PNB, EndB);
- Builder.CreateCondBr(CheckB, BB4, BB1);
- DTU.applyUpdates(
- {{DominatorTree::Insert, BB3, BB4}, {DominatorTree::Insert, BB3, BB1}});
+ Builder.CreateMaskedLoad(CharVTy, PA, Align(1), PredA, Passthru);
+ Builder.CreateBr(BB2);
+ DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2}});
- // (4) Check load B.
- Builder.SetInsertPoint(BB4);
- Value *IncB = Builder.CreateGEP(CharTy, PNB, ConstantInt::get(I64Ty, VF), "",
- GEPB->isInBounds());
- Value *IfNotFullB = Builder.CreateICmpUGT(IncB, EndB);
- Builder.CreateCondBr(IfNotFullB, BB6, BB5);
- DTU.applyUpdates(
- {{DominatorTree::Insert, BB4, BB6}, {DominatorTree::Insert, BB4, BB5}});
+ // (2) Inner loop.
+ Builder.SetInsertPoint(BB2);
+ PHINode *PB = Builder.CreatePHI(PtrTy, 2, "pb");
- // (5) Full load B.
- Builder.SetInsertPoint(BB5);
- Value *LoadBFull =
- Builder.CreateMaskedLoad(CharVTy, PNB, Align(1), PredBFull, Passthru);
- Builder.CreateBr(BB7);
- DTU.applyUpdates({{DominatorTree::Insert, BB5, BB7}});
-
- // (6) Partial load B.
- Builder.SetInsertPoint(BB6);
- Value *PredBPart =
+ // (2.a) Load a vector's worth of B.
+ Value *IncB = Builder.CreateGEP(CharTy, PB, ConstVF);
+ Value *CheckB = Builder.CreateICmpULT(IncB, EndB);
+ Value *SelB = Builder.CreateSelect(CheckB, IncB, EndB);
+ Value *PredB =
Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
- {Builder.CreatePointerCast(PNB, I64Ty),
- Builder.CreatePointerCast(EndB, I64Ty)});
+ {Builder.CreatePointerCast(PB, I64Ty),
+ Builder.CreatePointerCast(SelB, I64Ty)});
Value *LoadBPart =
- Builder.CreateMaskedLoad(CharVTy, PNB, Align(1), PredBPart, Passthru);
+ Builder.CreateMaskedLoad(CharVTy, PB, Align(1), PredB, Passthru);
+
+ // (2.b) Splat the first element to the inactive lanes.
Value *LoadB0 = Builder.CreateExtractElement(LoadBPart, uint64_t(0));
- Value *LoadBSplat =
+ Value *LoadB0Splat =
Builder.CreateVectorSplat(PredVTy->getElementCount(), LoadB0);
- LoadBPart = Builder.CreateSelect(PredBPart, LoadBPart, LoadBSplat);
- Builder.CreateBr(BB7);
- DTU.applyUpdates({{DominatorTree::Insert, BB6, BB7}});
-
- // (7) Carry out match.
- Builder.SetInsertPoint(BB7);
- PHINode *PredBNext = Builder.CreatePHI(PredVTy, 2);
- PHINode *LoadB = Builder.CreatePHI(CharVTy, 2);
- Value *MatchPred = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_match, {CharVTy},
- {LoadA, LoadB, PredA, ConstantInt::get(I32Ty, VF)});
+ Value *LoadB = Builder.CreateSelect(PredB, LoadBPart, LoadB0Splat);
+
+ // (2.c) Test if there's a match.
+ Value *MatchPred =
+ Builder.CreateIntrinsic(Intrinsic::experimental_vector_match, {CharVTy},
+ {LoadA, LoadB, PredA, ConstVF});
Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred);
- Builder.CreateCondBr(IfAnyMatch, BB8, BB3);
+ Builder.CreateCondBr(IfAnyMatch, BB3, BB4);
DTU.applyUpdates(
- {{DominatorTree::Insert, BB7, BB8}, {DominatorTree::Insert, BB7, BB3}});
+ {{DominatorTree::Insert, BB2, BB3}, {DominatorTree::Insert, BB2, BB4}});
- // (8) Match success.
- Builder.SetInsertPoint(BB8);
+  // (3) We found a match. Compute its index and exit.
+ Builder.SetInsertPoint(BB3);
Value *MatchCnt = Builder.CreateIntrinsic(
Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()},
{MatchPred, /*ZeroIsPoison=*/Builder.getInt1(true)});
- Value *MatchVal = Builder.CreateGEP(CharTy, PNA, MatchCnt);
+ Value *MatchVal = Builder.CreateGEP(CharTy, PA, MatchCnt);
Builder.CreateBr(ExitSucc);
- DTU.applyUpdates({{DominatorTree::Insert, BB8, ExitSucc}});
+ DTU.applyUpdates({{DominatorTree::Insert, BB3, ExitSucc}});
- // Set incoming values for PHIs.
- PNA->addIncoming(StartA, Preheader);
- PNA->addIncoming(IncA, BB3);
+ // (4) Check if we've reached the end of B.
+ Builder.SetInsertPoint(BB4);
+ Builder.CreateCondBr(CheckB, BB2, BB5);
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, BB4, BB2}, {DominatorTree::Insert, BB4, BB5}});
- PNB->addIncoming(StartB, BB2);
- PNB->addIncoming(IncB, BB7);
- PredBFull->addIncoming(PredBInit, BB2);
- PredBFull->addIncoming(PredBNext, BB7);
+ // (5) Check if we've reached the end of A.
+ Builder.SetInsertPoint(BB5);
+ Builder.CreateCondBr(CheckA, BB1, ExitFail);
+ DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1},
+ {DominatorTree::Insert, BB5, ExitFail}});
- PredBNext->addIncoming(PredBFull, BB5);
- PredBNext->addIncoming(PredBPart, BB6);
- LoadB->addIncoming(LoadBFull, BB5);
- LoadB->addIncoming(LoadBPart, BB6);
+  // Set up the PHIs.
+ PA->addIncoming(StartA, Preheader);
+ PA->addIncoming(IncA, BB5);
+ PB->addIncoming(StartB, BB1);
+ PB->addIncoming(IncB, BB4);
if (VerifyLoops) {
- OuterLoop->verifyLoop();
- InnerLoop->verifyLoop();
- if (!OuterLoop->isRecursivelyLCSSAForm(*DT, *LI))
- report_fatal_error("Loops must remain in LCSSA form!");
- if (!InnerLoop->isRecursivelyLCSSAForm(*DT, *LI))
+ OL->verifyLoop();
+ IL->verifyLoop();
+ if (!OL->isRecursivelyLCSSAForm(*DT, *LI))
report_fatal_error("Loops must remain in LCSSA form!");
}
- assert(OldPH->hasNPredecessors(0) && "Expected old loop to be unreachable.");
-
return MatchVal;
}
-void LoopIdiomVectorize::transformFindFirstByte(
- PHINode *IndPhi, unsigned VF, unsigned CharWidth, BasicBlock *ExitSucc,
- BasicBlock *ExitFail, GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
- Value *StartA, Value *EndA, Value *StartB, Value *EndB) {
+void LoopIdiomVectorize::transformFindFirstByte(PHINode *IndPhi, unsigned VF,
+ unsigned CharWidth,
+ BasicBlock *ExitSucc,
+ BasicBlock *ExitFail,
+ Value *StartA, Value *EndA,
+ Value *StartB, Value *EndB) {
// Insert the find first byte code at the end of the preheader block.
BasicBlock *Preheader = CurLoop->getLoopPreheader();
BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
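Stepping back, blocks (1)-(5) created in expandFindFirstByte above implement
the following control flow (a compilable C++ model of the emitted loop nest;
findFirstByteModel is illustrative and not part of the patch, the masked
loads and @llvm.experimental.vector.match are modelled with plain element
loops, the (2.b) splat only neutralises inactive lanes and so needs no
modelling, and pointer arithmetic is assumed not to go out of bounds):

    #include <cstddef>

    const char *findFirstByteModel(const char *StartA, const char *EndA,
                                   const char *StartB, const char *EndB,
                                   std::size_t VF) {
      const char *PA = StartA;
      while (true) {                                 // (1) next chunk of A
        const char *SelA = PA + VF < EndA ? PA + VF : EndA;
        const char *PB = StartB;
        while (true) {                               // (2) next chunk of B
          const char *SelB = PB + VF < EndB ? PB + VF : EndB;
          for (const char *A = PA; A != SelA; ++A)   // (2.c)+(3): lowest
            for (const char *B = PB; B != SelB; ++B) // matching lane of A
              if (*A == *B)
                return A;
          if (!(PB + VF < EndB))                     // (4) end of B?
            break;
          PB += VF;
        }
        if (!(PA + VF < EndA))                       // (5) end of A?
          return EndA;                               // no match: ExitFail
        PA += VF;
      }
    }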
@@ -1358,21 +1343,21 @@ void LoopIdiomVectorize::transformFindFirstByte(
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
- Value *MatchVal =
- expandFindFirstByte(Builder, DTU, VF, CharWidth, ExitSucc, ExitFail, GEPA,
- GEPB, StartA, EndA, StartB, EndB);
+ Value *MatchVal = expandFindFirstByte(Builder, DTU, VF, CharWidth, ExitSucc,
+ ExitFail, StartA, EndA, StartB, EndB);
- assert(PHBranch->isUnconditional() &&
- "Expected preheader to terminate with an unconditional branch.");
-
- // Add new incoming values with the result of the transformation to the old
- // uses of IndPhi in ExitSucc.
+  // Add new incoming values with the result of the transformation to the PHI
+  // nodes in ExitSucc that use IndPhi.
for (auto &PN : ExitSucc->phis())
- for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
- if (PN.getIncomingValue(i) == IndPhi)
+ for (auto const &V : PN.incoming_values())
+ if (V == IndPhi) {
PN.addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent());
+ break;
+ }
- // Maybe EliminateUnreachableBlocks ? I've left them for now because we may
- // want to reuse them to implement an alternative path for small arrays, for
- // example.
+ if (VerifyLoops && CurLoop->getParentLoop()) {
+ CurLoop->getParentLoop()->verifyLoop();
+ if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
+ report_fatal_error("Loops must remain in LCSSA form!");
+ }
}
diff --git a/llvm/test/CodeGen/AArch64/find-first-byte.ll b/llvm/test/CodeGen/AArch64/find-first-byte.ll
index 4bf3df1d4817b6..95a053e85ad3f5 100644
--- a/llvm/test/CodeGen/AArch64/find-first-byte.ll
+++ b/llvm/test/CodeGen/AArch64/find-first-byte.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mattr=+sve2 -mtriple=aarch64 -passes='loop(loop-idiom-vectorize),simplifycfg' -S < %s | FileCheck -check-prefix=SVE2 %s
-; RUN: opt -mattr=-sve2 -mtriple=aarch64 -passes='loop(loop-idiom-vectorize),simplifycfg' -S < %s | FileCheck -check-prefix=NOSVE2 %s
+; RUN: opt -mattr=+sve2 -mtriple=aarch64 -passes='loop(loop-idiom-vectorize)' -S < %s | FileCheck -check-prefix=SVE2 %s
+; RUN: opt -mattr=-sve2 -mtriple=aarch64 -passes='loop(loop-idiom-vectorize)' -S < %s | FileCheck -check-prefix=NOSVE2 %s
define dso_local noundef ptr @first_byte_of(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
; SVE2-LABEL: define dso_local noundef ptr @first_byte_of(
@@ -8,83 +8,104 @@ define dso_local noundef ptr @first_byte_of(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
; SVE2-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
; SVE2-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
; SVE2-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; SVE2-NEXT: br i1 [[TMP7]], [[DOTLOOPEXIT1:label %.*]], label %[[DOTPREHEADER:.*]]
+; SVE2-NEXT: br i1 [[TMP7]], label %[[BB46:.*]], label %[[DOTPREHEADER:.*]]
; SVE2: [[_PREHEADER:.*:]]
-; SVE2-NEXT: [[PNA:%.*]] = phi ptr [ [[TMP10:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], [[TMP4:%.*]] ]
-; SVE2-NEXT: [[TMP8:%.*]] = icmp ult ptr [[PNA]], [[TMP1]]
-; SVE2-NEXT: br i1 [[TMP8]], label %[[BB9:.*]], [[DOTLOOPEXIT1]]
-; SVE2: [[BB9]]:
-; SVE2-NEXT: [[TMP10]] = getelementptr inbounds i8, ptr [[PNA]], i64 16
-; SVE2-NEXT: [[TMP11:%.*]] = icmp ugt ptr [[TMP10]], [[TMP1]]
-; SVE2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], ptr [[TMP1]], ptr [[TMP10]]
-; SVE2-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[PNA]] to i64
-; SVE2-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP12]] to i64
-; SVE2-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP13]], i64 [[TMP14]])
-; SVE2-NEXT: [[TMP16:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNA]], i32 1, <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> zeroinitializer)
-; SVE2-NEXT: [[TMP17:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
-; SVE2-NEXT: br label %[[TMP18]]
-; SVE2: [[TMP18]]:
-; SVE2-NEXT: [[PNB:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP22:%.*]], %[[TMP33:.*]] ]
-; SVE2-NEXT: [[TMP19:%.*]] = phi <vscale x 16 x i1> [ [[TMP17]], %[[BB9]] ], [ [[TMP34:%.*]], %[[TMP33]] ]
-; SVE2-NEXT: [[TMP20:%.*]] = icmp ult ptr [[PNB]], [[TMP3]]
-; SVE2-NEXT: br i1 [[TMP20]], label %[[BB21:.*]], label %[[DOTPREHEADER]]
-; SVE2: [[BB21]]:
-; SVE2-NEXT: [[TMP22]] = getelementptr inbounds i8, ptr [[PNB]], i64 16
-; SVE2-NEXT: [[TMP23:%.*]] = icmp ugt ptr [[TMP22]], [[TMP3]]
-; SVE2-NEXT: br i1 [[TMP23]], label %[[BB26:.*]], label %[[BB24:.*]]
-; SVE2: [[BB24]]:
-; SVE2-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNB]], i32 1, <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> zeroinitializer)
-; SVE2-NEXT: br label %[[TMP33]]
-; SVE2: [[BB26]]:
-; SVE2-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[PNB]] to i64
-; SVE2-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[TMP3]] to i64
-; SVE2-NEXT: [[TMP29:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP27]], i64 [[TMP28]])
-; SVE2-NEXT: [[TMP30:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNB]], i32 1, <vscale x 16 x i1> [[TMP29]], <vscale x 16 x i8> zeroinitializer)
-; SVE2-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i8> [[TMP30]], i64 0
-; SVE2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP31]], i64 0
+; SVE2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[BB8:.*]]
+; SVE2: [[BB8]]:
+; SVE2-NEXT: [[PA:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP14:%.*]], %[[TMP37:.*]] ]
+; SVE2-NEXT: [[TMP14]] = getelementptr i8, ptr [[PA]], i32 16
+; SVE2-NEXT: [[TMP10:%.*]] = icmp ult ptr [[TMP14]], [[TMP1]]
+; SVE2-NEXT: [[TMP16:%.*]] = select i1 [[TMP10]], ptr [[TMP14]], ptr [[TMP1]]
+; SVE2-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[PA]] to i64
+; SVE2-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP16]] to i64
+; SVE2-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP17]], i64 [[TMP18]])
+; SVE2-NEXT: [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PA]], i32 1, <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> zeroinitializer)
+; SVE2-NEXT: br label %[[BB16:.*]]
+; SVE2: [[BB16]]:
+; SVE2-NEXT: [[PB:%.*]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP26:%.*]], %[[TMP38:.*]] ]
+; SVE2-NEXT: [[TMP26]] = getelementptr i8, ptr [[PB]], i32 16
+; SVE2-NEXT: [[TMP21:%.*]] = icmp ult ptr [[TMP26]], [[TMP3]]
+; SVE2-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], ptr [[TMP26]], ptr [[TMP3]]
+; SVE2-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[PB]] to i64
+; SVE2-NEXT: [[TMP32:%.*]] = ptrtoint ptr [[TMP22]] to i64
+; SVE2-NEXT: [[TMP33:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP31]], i64 [[TMP32]])
+; SVE2-NEXT: [[TMP34:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PB]], i32 1, <vscale x 16 x i1> [[TMP33]], <vscale x 16 x i8> zeroinitializer)
+; SVE2-NEXT: [[TMP35:%.*]] = extractelement <vscale x 16 x i8> [[TMP34]], i64 0
+; SVE2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP35]], i64 0
; SVE2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
-; SVE2-NEXT: [[TMP32:%.*]] = select <vscale x 16 x i1> [[TMP29]], <vscale x 16 x i8> [[TMP30]], <vscale x 16 x i8> [[DOTSPLAT]]
-; SVE2-NEXT: br label %[[TMP33]]
-; SVE2: [[TMP33]]:
-; SVE2-NEXT: [[TMP34]] = phi <vscale x 16 x i1> [ [[TMP19]], %[[BB24]] ], [ [[TMP29]], %[[BB26]] ]
-; SVE2-NEXT: [[TMP35:%.*]] = phi <vscale x 16 x i8> [ [[TMP25]], %[[BB24]] ], [ [[TMP32]], %[[BB26]] ]
-; SVE2-NEXT: [[TMP36:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP35]], <vscale x 16 x i1> [[TMP15]], i32 16)
-; SVE2-NEXT: [[TMP37:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP36]])
-; SVE2-NEXT: br i1 [[TMP37]], label %[[DOTLOOPEXIT:.*]], label %[[TMP18]]
+; SVE2-NEXT: [[TMP36:%.*]] = select <vscale x 16 x i1> [[TMP33]], <vscale x 16 x i8> [[TMP34]], <vscale x 16 x i8> [[DOTSPLAT]]
+; SVE2-NEXT: [[TMP40:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x i8> [[TMP36]], <vscale x 16 x i1> [[TMP19]], i32 16)
+; SVE2-NEXT: [[TMP41:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP40]])
+; SVE2-NEXT: br i1 [[TMP41]], label %[[BB28:.*]], label %[[TMP38]]
+; SVE2: [[BB28]]:
+; SVE2-NEXT: [[TMP43:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP40]], i1 true)
+; SVE2-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[PA]], i64 [[TMP43]]
+; SVE2-NEXT: br label %[[DOTLOOPEXIT:.*]]
+; SVE2: [[TMP38]]:
+; SVE2-NEXT: br i1 [[TMP21]], label %[[BB16]], label %[[TMP37]]
+; SVE2: [[TMP37]]:
+; SVE2-NEXT: br i1 [[TMP10]], label %[[BB8]], label %[[DOTLOOPEXIT1:.*]]
+; SVE2: [[SCALAR_PH]]:
+; SVE2-NEXT: br label %[[BB33:.*]]
+; SVE2: [[BB33]]:
+; SVE2-NEXT: [[TMP46:%.*]] = phi ptr [ [[TMP56:%.*]], %[[TMP45:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ]
+; SVE2-NEXT: [[TMP47:%.*]] = load i8, ptr [[TMP46]], align 1
+; SVE2-NEXT: br label %[[BB39:.*]]
+; SVE2: [[BB36:.*]]:
+; SVE2-NEXT: [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[TMP52:%.*]], i64 1
+; SVE2-NEXT: [[TMP50:%.*]] = icmp eq ptr [[TMP49]], [[TMP3]]
+; SVE2-NEXT: br i1 [[TMP50]], label %[[TMP45]], label %[[BB39]]
+; SVE2: [[BB39]]:
+; SVE2-NEXT: [[TMP52]] = phi ptr [ [[TMP2]], %[[BB33]] ], [ [[TMP49]], %[[BB36]] ]
+; SVE2-NEXT: [[TMP53:%.*]] = load i8, ptr [[TMP52]], align 1
+; SVE2-NEXT: [[TMP54:%.*]] = icmp eq i8 [[TMP47]], [[TMP53]]
+; SVE2-NEXT: br i1 [[TMP54]], label %[[DOTLOOPEXIT]], label %[[BB36]]
+; SVE2: [[TMP45]]:
+; SVE2-NEXT: [[TMP56]] = getelementptr inbounds i8, ptr [[TMP46]], i64 1
+; SVE2-NEXT: [[TMP57:%.*]] = icmp eq ptr [[TMP56]], [[TMP1]]
+; SVE2-NEXT: br i1 [[TMP57]], label %[[DOTLOOPEXIT1]], label %[[BB33]]
; SVE2: [[_LOOPEXIT:.*:]]
-; SVE2-NEXT: [[TMP38:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP36]], i1 true)
-; SVE2-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[PNA]], i64 [[TMP38]]
-; SVE2-NEXT: br [[DOTLOOPEXIT1]]
+; SVE2-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP46]], %[[BB39]] ], [ [[TMP44]], %[[BB28]] ]
+; SVE2-NEXT: br label %[[BB46]]
; SVE2: [[_LOOPEXIT1:.*:]]
-; SVE2-NEXT: [[TMP40:%.*]] = phi ptr [ [[TMP1]], [[TMP4]] ], [ [[TMP39]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTPREHEADER]] ]
-; SVE2-NEXT: ret ptr [[TMP40]]
+; SVE2-NEXT: br label %[[BB46]]
+; SVE2: [[BB46]]:
+; SVE2-NEXT: [[TMP59:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; SVE2-NEXT: ret ptr [[TMP59]]
;
; NOSVE2-LABEL: define dso_local noundef ptr @first_byte_of(
; NOSVE2-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
; NOSVE2-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
; NOSVE2-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
; NOSVE2-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; NOSVE2-NEXT: br i1 [[TMP7]], [[DOTLOOPEXIT:label %.*]], label %[[DOTPREHEADER:.*]]
+; NOSVE2-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
+; NOSVE2: [[_PREHEADER:.*:]]
+; NOSVE2-NEXT: br label %[[BB8:.*]]
+; NOSVE2: [[BB8]]:
+; NOSVE2-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
+; NOSVE2-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; NOSVE2-NEXT: br label %[[BB14:.*]]
+; NOSVE2: [[BB11:.*]]:
+; NOSVE2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1
+; NOSVE2-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
+; NOSVE2-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
+; NOSVE2: [[BB14]]:
+; NOSVE2-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
+; NOSVE2-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
+; NOSVE2-NEXT: [[TMP17:%.*]] = icmp eq i8 [[TMP10]], [[TMP16]]
+; NOSVE2-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
+; NOSVE2: [[TMP18]]:
+; NOSVE2-NEXT: [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1
+; NOSVE2-NEXT: [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
+; NOSVE2-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
; NOSVE2: [[_LOOPEXIT:.*:]]
-; NOSVE2-NEXT: [[TMP8:%.*]] = phi ptr [ [[TMP18:%.*]], %[[TMP17:.*]] ], [ [[TMP0]], [[TMP4:%.*]] ]
-; NOSVE2-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1
-; NOSVE2-NEXT: br label %[[BB13:.*]]
-; NOSVE2: [[BB10:.*]]:
-; NOSVE2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP14:%.*]], i64 1
-; NOSVE2-NEXT: [[TMP12:%.*]] = icmp eq ptr [[TMP11]], [[TMP3]]
-; NOSVE2-NEXT: br i1 [[TMP12]], label %[[TMP17]], label %[[BB13]]
-; NOSVE2: [[BB13]]:
-; NOSVE2-NEXT: [[TMP14]] = phi ptr [ [[TMP2]], %[[DOTPREHEADER]] ], [ [[TMP11]], %[[BB10]] ]
-; NOSVE2-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP14]], align 1
-; NOSVE2-NEXT: [[TMP16:%.*]] = icmp eq i8 [[TMP9]], [[TMP15]]
-; NOSVE2-NEXT: br i1 [[TMP16]], [[DOTLOOPEXIT]], label %[[BB10]]
-; NOSVE2: [[TMP17]]:
-; NOSVE2-NEXT: [[TMP18]] = getelementptr inbounds i8, ptr [[TMP8]], i64 1
-; NOSVE2-NEXT: [[TMP19:%.*]] = icmp eq ptr [[TMP18]], [[TMP1]]
-; NOSVE2-NEXT: br i1 [[TMP19]], [[DOTLOOPEXIT]], label %[[DOTPREHEADER]]
-; NOSVE2: [[_LOOPEXIT2:.*:]]
-; NOSVE2-NEXT: [[TMP40:%.*]] = phi ptr [ [[TMP1]], [[TMP4]] ], [ [[TMP8]], %[[BB13]] ], [ [[TMP1]], %[[TMP17]] ]
-; NOSVE2-NEXT: ret ptr [[TMP40]]
+; NOSVE2-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
+; NOSVE2-NEXT: br label %[[BB21]]
+; NOSVE2: [[_LOOPEXIT1:.*:]]
+; NOSVE2-NEXT: br label %[[BB21]]
+; NOSVE2: [[BB21]]:
+; NOSVE2-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; NOSVE2-NEXT: ret ptr [[TMP22]]
;
%5 = icmp eq ptr %0, %1
%6 = icmp eq ptr %2, %3