[llvm] 5f84b6e - [AArch64] Add MATCH loops to LoopIdiomVectorizePass (#101976)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 10 00:23:38 PST 2025
Author: Ricardo Jesus
Date: 2025-02-10T08:23:34Z
New Revision: 5f84b6edd97153f1e5ec00ce110108ba8f6048bd
URL: https://github.com/llvm/llvm-project/commit/5f84b6edd97153f1e5ec00ce110108ba8f6048bd
DIFF: https://github.com/llvm/llvm-project/commit/5f84b6edd97153f1e5ec00ce110108ba8f6048bd.diff
LOG: [AArch64] Add MATCH loops to LoopIdiomVectorizePass (#101976)
This patch adds a new loop to LoopIdiomVectorizePass, enabling it to
recognise and vectorise loops such as:
```cpp
template<class InputIt, class ForwardIt>
InputIt find_first_of(InputIt first, InputIt last,
ForwardIt s_first, ForwardIt s_last)
{
for (; first != last; ++first)
for (ForwardIt it = s_first; it != s_last; ++it)
if (*first == *it)
return first;
return last;
}
```
These loops match the C++ standard library function `std::find_first_of`.
Added:
llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
Modified:
llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 7af7408ed67a8c2..90329200dd7e485 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -10,8 +10,10 @@
// transforms them into more optimized versions of the same loop. In cases
// where this happens, it can be a significant performance win.
//
-// We currently only recognize one loop that finds the first mismatched byte
-// in an array and returns the index, i.e. something like:
+// We currently support two loops:
+//
+// 1. A loop that finds the first mismatched byte in an array and returns the
+// index, i.e. something like:
//
// while (++i != n) {
// if (a[i] != b[i])
@@ -24,12 +26,6 @@
// boundaries. However, even with these checks it is still profitable to do the
// transformation.
//
-//===----------------------------------------------------------------------===//
-//
-// NOTE: This Pass matches a really specific loop pattern because it's only
-// supposed to be a temporary solution until our LoopVectorizer is powerful
-// enought to vectorize it automatically.
-//
// TODO List:
//
// * Add support for the inverse case where we scan for a matching element.
@@ -37,6 +33,35 @@
// * Recognize loops that increment the IV *after* comparing bytes.
// * Allow 32-bit sign-extends of the IV used by the GEP.
//
+// 2. A loop that finds the first matching character in an array among a set of
+// possible matches, e.g.:
+//
+// for (; first != last; ++first)
+// for (s_it = s_first; s_it != s_last; ++s_it)
+// if (*first == *s_it)
+// return first;
+// return last;
+//
+// This corresponds to std::find_first_of (for arrays of bytes) from the C++
+// standard library. This function can be implemented efficiently for targets
+// that support @llvm.experimental.vector.match. For example, on AArch64 targets
+// that implement SVE2, this lowers to a MATCH instruction, which enables us to
+// perform up to 16x16=256 comparisons in one go. This can lead to very
+// significant speedups.
+//
+// TODO:
+//
+// * Add support for `find_first_not_of' loops (i.e. with not-equal comparison).
+// * Make VF a configurable parameter (right now we assume 128-bit vectors).
+// * Potentially adjust the cost model to let the transformation kick in even if
+// @llvm.experimental.vector.match doesn't have direct support in hardware.
+//
+//===----------------------------------------------------------------------===//
+//
+// NOTE: This Pass matches really specific loop patterns because it's only
+// supposed to be a temporary solution until our LoopVectorizer is powerful
+// enough to vectorize them automatically.
+//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
@@ -79,6 +104,11 @@ static cl::opt<unsigned>
cl::desc("The vectorization factor for byte-compare patterns."),
cl::init(16));
+static cl::opt<bool>
+ DisableFindFirstByte("disable-loop-idiom-vectorize-find-first-byte",
+ cl::Hidden, cl::init(false),
+ cl::desc("Do not convert find-first-byte loop(s)."));
+
static cl::opt<bool>
VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false),
cl::desc("Verify loops generated Loop Idiom Vectorize Pass."));
@@ -136,6 +166,19 @@ class LoopIdiomVectorize {
PHINode *IndPhi, Value *MaxLen, Instruction *Index,
Value *Start, bool IncIdx, BasicBlock *FoundBB,
BasicBlock *EndBB);
+
+ bool recognizeFindFirstByte();
+
+ Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU,
+ unsigned VF, Type *CharTy, BasicBlock *ExitSucc,
+ BasicBlock *ExitFail, Value *SearchStart,
+ Value *SearchEnd, Value *NeedleStart,
+ Value *NeedleEnd);
+
+ void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy,
+ BasicBlock *ExitSucc, BasicBlock *ExitFail,
+ Value *SearchStart, Value *SearchEnd,
+ Value *NeedleStart, Value *NeedleEnd);
/// @}
};
} // anonymous namespace
@@ -190,7 +233,13 @@ bool LoopIdiomVectorize::run(Loop *L) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %"
<< CurLoop->getHeader()->getName() << "\n");
- return recognizeByteCompare();
+ if (recognizeByteCompare())
+ return true;
+
+ if (recognizeFindFirstByte())
+ return true;
+
+ return false;
}
bool LoopIdiomVectorize::recognizeByteCompare() {
@@ -939,3 +988,432 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
report_fatal_error("Loops must remain in LCSSA form!");
}
}
+
+bool LoopIdiomVectorize::recognizeFindFirstByte() {
+ // Currently the transformation only works on scalable vector types, although
+ // there is no fundamental reason why it cannot be made to work for fixed
+ // vectors. We also need to know the target's minimum page size in order to
+ // generate runtime memory checks to ensure the vector version won't fault.
+ if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() ||
+ DisableFindFirstByte)
+ return false;
+
+ // Define some constants we need throughout.
+ BasicBlock *Header = CurLoop->getHeader();
+ LLVMContext &Ctx = Header->getContext();
+
+ // We are expecting the four blocks defined below: Header, MatchBB, InnerBB,
+ // and OuterBB. For now, we will bail out for almost anything else. The four
+ // blocks contain one nested loop.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 4 ||
+ CurLoop->getSubLoops().size() != 1)
+ return false;
+
+ auto *InnerLoop = CurLoop->getSubLoops().front();
+ PHINode *IndPhi = dyn_cast<PHINode>(&Header->front());
+ if (!IndPhi || IndPhi->getNumIncomingValues() != 2)
+ return false;
+
+ // Check instruction counts.
+ auto LoopBlocks = CurLoop->getBlocks();
+ if (LoopBlocks[0]->sizeWithoutDebug() > 3 ||
+ LoopBlocks[1]->sizeWithoutDebug() > 4 ||
+ LoopBlocks[2]->sizeWithoutDebug() > 3 ||
+ LoopBlocks[3]->sizeWithoutDebug() > 3)
+ return false;
+
+ // Check that no instruction other than IndPhi has outside uses.
+ for (BasicBlock *BB : LoopBlocks)
+ for (Instruction &I : *BB)
+ if (&I != IndPhi)
+ for (User *U : I.users())
+ if (!CurLoop->contains(cast<Instruction>(U)))
+ return false;
+
+ // Match the branch instruction in the header. We are expecting an
+ // unconditional branch to the inner loop.
+ //
+ // Header:
+ // %14 = phi ptr [ %24, %OuterBB ], [ %3, %Header.preheader ]
+ // %15 = load i8, ptr %14, align 1
+ // br label %MatchBB
+ BasicBlock *MatchBB;
+ if (!match(Header->getTerminator(), m_UnconditionalBr(MatchBB)) ||
+ !InnerLoop->contains(MatchBB))
+ return false;
+
+ // MatchBB should be the entrypoint into the inner loop containing the
+ // comparison between a search element and a needle.
+ //
+ // MatchBB:
+ // %20 = phi ptr [ %7, %Header ], [ %17, %InnerBB ]
+ // %21 = load i8, ptr %20, align 1
+ // %22 = icmp eq i8 %15, %21
+ // br i1 %22, label %ExitSucc, label %InnerBB
+ BasicBlock *ExitSucc, *InnerBB;
+ Value *LoadSearch, *LoadNeedle;
+ CmpPredicate MatchPred;
+ if (!match(MatchBB->getTerminator(),
+ m_Br(m_ICmp(MatchPred, m_Value(LoadSearch), m_Value(LoadNeedle)),
+ m_BasicBlock(ExitSucc), m_BasicBlock(InnerBB))) ||
+ MatchPred != ICmpInst::ICMP_EQ || !InnerLoop->contains(InnerBB))
+ return false;
+
+ // We expect outside uses of `IndPhi' in ExitSucc (and only there).
+ for (User *U : IndPhi->users())
+ if (!CurLoop->contains(cast<Instruction>(U))) {
+ auto *PN = dyn_cast<PHINode>(U);
+ if (!PN || PN->getParent() != ExitSucc)
+ return false;
+ }
+
+ // Match the loads and check they are simple.
+ Value *Search, *Needle;
+ if (!match(LoadSearch, m_Load(m_Value(Search))) ||
+ !match(LoadNeedle, m_Load(m_Value(Needle))) ||
+ !cast<LoadInst>(LoadSearch)->isSimple() ||
+ !cast<LoadInst>(LoadNeedle)->isSimple())
+ return false;
+
+ // Check we are loading valid characters.
+ Type *CharTy = LoadSearch->getType();
+ if (!CharTy->isIntegerTy() || LoadNeedle->getType() != CharTy)
+ return false;
+
+ // Pick the vectorisation factor based on CharTy, work out the cost of the
+ // match intrinsic and decide if we should use it.
+ // Note: For the time being we assume 128-bit vectors.
+ unsigned VF = 128 / CharTy->getIntegerBitWidth();
+ SmallVector<Type *> Args = {
+ ScalableVectorType::get(CharTy, VF), FixedVectorType::get(CharTy, VF),
+ ScalableVectorType::get(Type::getInt1Ty(Ctx), VF)};
+ IntrinsicCostAttributes Attrs(Intrinsic::experimental_vector_match, Args[2],
+ Args);
+ if (TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_SizeAndLatency) > 4)
+ return false;
+
+ // The loads come from two PHIs, each with two incoming values.
+ PHINode *PSearch = dyn_cast<PHINode>(Search);
+ PHINode *PNeedle = dyn_cast<PHINode>(Needle);
+ if (!PSearch || PSearch->getNumIncomingValues() != 2 || !PNeedle ||
+ PNeedle->getNumIncomingValues() != 2)
+ return false;
+
+ // One PHI comes from the outer loop (PSearch), the other one from the inner
+ // loop (PNeedle). PSearch effectively corresponds to IndPhi.
+ if (InnerLoop->contains(PSearch))
+ std::swap(PSearch, PNeedle);
+ if (PSearch != &Header->front() || PNeedle != &MatchBB->front())
+ return false;
+
+ // For each PHI node, the loop-carried incoming value should be a gep of 1.
+ Value *SearchStart = PSearch->getIncomingValue(0);
+ Value *SearchIndex = PSearch->getIncomingValue(1);
+ if (CurLoop->contains(PSearch->getIncomingBlock(0)))
+ std::swap(SearchStart, SearchIndex);
+
+ Value *NeedleStart = PNeedle->getIncomingValue(0);
+ Value *NeedleIndex = PNeedle->getIncomingValue(1);
+ if (InnerLoop->contains(PNeedle->getIncomingBlock(0)))
+ std::swap(NeedleStart, NeedleIndex);
+
+ // Match the GEPs.
+ if (!match(SearchIndex, m_GEP(m_Specific(PSearch), m_One())) ||
+ !match(NeedleIndex, m_GEP(m_Specific(PNeedle), m_One())))
+ return false;
+
+ // Check the GEPs result type matches `CharTy'.
+ GetElementPtrInst *GEPSearch = cast<GetElementPtrInst>(SearchIndex);
+ GetElementPtrInst *GEPNeedle = cast<GetElementPtrInst>(NeedleIndex);
+ if (GEPSearch->getResultElementType() != CharTy ||
+ GEPNeedle->getResultElementType() != CharTy)
+ return false;
+
+ // InnerBB should increment the address of the needle pointer.
+ //
+ // InnerBB:
+ // %17 = getelementptr inbounds i8, ptr %20, i64 1
+ // %18 = icmp eq ptr %17, %10
+ // br i1 %18, label %OuterBB, label %MatchBB
+ BasicBlock *OuterBB;
+ Value *NeedleEnd;
+ if (!match(InnerBB->getTerminator(),
+ m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(GEPNeedle),
+ m_Value(NeedleEnd)),
+ m_BasicBlock(OuterBB), m_Specific(MatchBB))) ||
+ !CurLoop->contains(OuterBB))
+ return false;
+
+ // OuterBB should increment the address of the search element pointer.
+ //
+ // OuterBB:
+ // %24 = getelementptr inbounds i8, ptr %14, i64 1
+ // %25 = icmp eq ptr %24, %6
+ // br i1 %25, label %ExitFail, label %Header
+ BasicBlock *ExitFail;
+ Value *SearchEnd;
+ if (!match(OuterBB->getTerminator(),
+ m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(GEPSearch),
+ m_Value(SearchEnd)),
+ m_BasicBlock(ExitFail), m_Specific(Header))))
+ return false;
+
+ if (!CurLoop->isLoopInvariant(SearchStart) ||
+ !CurLoop->isLoopInvariant(SearchEnd) ||
+ !CurLoop->isLoopInvariant(NeedleStart) ||
+ !CurLoop->isLoopInvariant(NeedleEnd))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Found idiom in loop: \n" << *CurLoop << "\n\n");
+
+ transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, SearchStart,
+ SearchEnd, NeedleStart, NeedleEnd);
+ return true;
+}
+
+Value *LoopIdiomVectorize::expandFindFirstByte(
+ IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy,
+ BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *SearchStart,
+ Value *SearchEnd, Value *NeedleStart, Value *NeedleEnd) {
+ // Set up some types and constants that we intend to reuse.
+ auto *PtrTy = Builder.getPtrTy();
+ auto *I64Ty = Builder.getInt64Ty();
+ auto *PredVTy = ScalableVectorType::get(Builder.getInt1Ty(), VF);
+ auto *CharVTy = ScalableVectorType::get(CharTy, VF);
+ auto *ConstVF = ConstantInt::get(I64Ty, VF);
+
+ // Other common arguments.
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ LLVMContext &Ctx = Preheader->getContext();
+ Value *Passthru = ConstantInt::getNullValue(CharVTy);
+
+ // Split block in the original loop preheader.
+ // SPH is the new preheader to the old scalar loop.
+ BasicBlock *SPH = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
+ nullptr, "scalar_preheader");
+
+ // Create the blocks that we're going to use.
+ //
+ // We will have the following loops:
+ // (O) Outer loop where we iterate over the elements of the search array.
+ // (I) Inner loop where we iterate over the elements of the needle array.
+ //
+ // Overall, the blocks do the following:
+ // (0) Check if the arrays can't cross page boundaries. If so go to (1),
+ // otherwise fall back to the original scalar loop.
+ // (1) Load the search array. Go to (2).
+ // (2) (a) Load the needle array.
+ // (b) Splat the first element to the inactive lanes.
+ // (c) Check if any elements match. If so go to (3), otherwise go to (4).
+ // (3) Compute the index of the first match and exit.
+ // (4) Check if we've reached the end of the needle array. If not loop back to
+ // (2), otherwise go to (5).
+ // (5) Check if we've reached the end of the search array. If not loop back to
+ // (1), otherwise exit.
+ // Blocks (0,3) are not part of any loop. Blocks (1,5) and (2,4) belong to
+ // the outer and inner loops, respectively.
+ BasicBlock *BB0 = BasicBlock::Create(Ctx, "mem_check", SPH->getParent(), SPH);
+ BasicBlock *BB1 =
+ BasicBlock::Create(Ctx, "find_first_vec_header", SPH->getParent(), SPH);
+ BasicBlock *BB2 =
+ BasicBlock::Create(Ctx, "match_check_vec", SPH->getParent(), SPH);
+ BasicBlock *BB3 =
+ BasicBlock::Create(Ctx, "calculate_match", SPH->getParent(), SPH);
+ BasicBlock *BB4 =
+ BasicBlock::Create(Ctx, "needle_check_vec", SPH->getParent(), SPH);
+ BasicBlock *BB5 =
+ BasicBlock::Create(Ctx, "search_check_vec", SPH->getParent(), SPH);
+
+ // Update LoopInfo with the new loops.
+ auto OuterLoop = LI->AllocateLoop();
+ auto InnerLoop = LI->AllocateLoop();
+
+ if (auto ParentLoop = CurLoop->getParentLoop()) {
+ ParentLoop->addBasicBlockToLoop(BB0, *LI);
+ ParentLoop->addChildLoop(OuterLoop);
+ ParentLoop->addBasicBlockToLoop(BB3, *LI);
+ } else {
+ LI->addTopLevelLoop(OuterLoop);
+ }
+
+ // Add the inner loop to the outer.
+ OuterLoop->addChildLoop(InnerLoop);
+
+ // Add the new basic blocks to the corresponding loops.
+ OuterLoop->addBasicBlockToLoop(BB1, *LI);
+ OuterLoop->addBasicBlockToLoop(BB5, *LI);
+ InnerLoop->addBasicBlockToLoop(BB2, *LI);
+ InnerLoop->addBasicBlockToLoop(BB4, *LI);
+
+ // Update the terminator added by SplitBlock to branch to the first block.
+ Preheader->getTerminator()->setSuccessor(0, BB0);
+ DTU.applyUpdates({{DominatorTree::Delete, Preheader, SPH},
+ {DominatorTree::Insert, Preheader, BB0}});
+
+ // (0) Check if we could be crossing a page boundary; if so, fall back to the
+ // old scalar loops. Also create a predicate of VF elements to be used in the
+ // vector loops.
+ Builder.SetInsertPoint(BB0);
+ Value *ISearchStart =
+ Builder.CreatePtrToInt(SearchStart, I64Ty, "search_start_int");
+ Value *ISearchEnd =
+ Builder.CreatePtrToInt(SearchEnd, I64Ty, "search_end_int");
+ Value *INeedleStart =
+ Builder.CreatePtrToInt(NeedleStart, I64Ty, "needle_start_int");
+ Value *INeedleEnd =
+ Builder.CreatePtrToInt(NeedleEnd, I64Ty, "needle_end_int");
+ Value *PredVF =
+ Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+ {ConstantInt::get(I64Ty, 0), ConstVF});
+
+ const uint64_t MinPageSize = TTI->getMinPageSize().value();
+ const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize);
+ Value *SearchStartPage =
+ Builder.CreateLShr(ISearchStart, AddrShiftAmt, "search_start_page");
+ Value *SearchEndPage =
+ Builder.CreateLShr(ISearchEnd, AddrShiftAmt, "search_end_page");
+ Value *NeedleStartPage =
+ Builder.CreateLShr(INeedleStart, AddrShiftAmt, "needle_start_page");
+ Value *NeedleEndPage =
+ Builder.CreateLShr(INeedleEnd, AddrShiftAmt, "needle_end_page");
+ Value *SearchPageCmp =
+ Builder.CreateICmpNE(SearchStartPage, SearchEndPage, "search_page_cmp");
+ Value *NeedlePageCmp =
+ Builder.CreateICmpNE(NeedleStartPage, NeedleEndPage, "needle_page_cmp");
+
+ Value *CombinedPageCmp =
+ Builder.CreateOr(SearchPageCmp, NeedlePageCmp, "combined_page_cmp");
+ BranchInst *CombinedPageBr = Builder.CreateCondBr(CombinedPageCmp, SPH, BB1);
+ CombinedPageBr->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(Ctx).createBranchWeights(10, 90));
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, BB0, SPH}, {DominatorTree::Insert, BB0, BB1}});
+
+ // (1) Load the search array and branch to the inner loop.
+ Builder.SetInsertPoint(BB1);
+ PHINode *Search = Builder.CreatePHI(PtrTy, 2, "psearch");
+ Value *PredSearch = Builder.CreateIntrinsic(
+ Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+ {Builder.CreatePtrToInt(Search, I64Ty), ISearchEnd}, nullptr,
+ "search_pred");
+ PredSearch = Builder.CreateAnd(PredVF, PredSearch, "search_masked");
+ Value *LoadSearch = Builder.CreateMaskedLoad(
+ CharVTy, Search, Align(1), PredSearch, Passthru, "search_load_vec");
+ Builder.CreateBr(BB2);
+ DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2}});
+
+ // (2) Inner loop.
+ Builder.SetInsertPoint(BB2);
+ PHINode *Needle = Builder.CreatePHI(PtrTy, 2, "pneedle");
+
+ // (2.a) Load the needle array.
+ Value *PredNeedle = Builder.CreateIntrinsic(
+ Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+ {Builder.CreatePtrToInt(Needle, I64Ty), INeedleEnd}, nullptr,
+ "needle_pred");
+ PredNeedle = Builder.CreateAnd(PredVF, PredNeedle, "needle_masked");
+ Value *LoadNeedle = Builder.CreateMaskedLoad(
+ CharVTy, Needle, Align(1), PredNeedle, Passthru, "needle_load_vec");
+
+ // (2.b) Splat the first element to the inactive lanes.
+ Value *Needle0 =
+ Builder.CreateExtractElement(LoadNeedle, uint64_t(0), "needle0");
+ Value *Needle0Splat = Builder.CreateVectorSplat(ElementCount::getScalable(VF),
+ Needle0, "needle0");
+ LoadNeedle = Builder.CreateSelect(PredNeedle, LoadNeedle, Needle0Splat,
+ "needle_splat");
+ LoadNeedle =
+ Builder.CreateExtractVector(FixedVectorType::get(CharTy, VF), LoadNeedle,
+ ConstantInt::get(I64Ty, 0), "needle_vec");
+
+ // (2.c) Test if there's a match.
+ Value *MatchPred = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_match, {CharVTy, LoadNeedle->getType()},
+ {LoadSearch, LoadNeedle, PredSearch}, nullptr, "match_pred");
+ Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred);
+ Builder.CreateCondBr(IfAnyMatch, BB3, BB4);
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, BB2, BB3}, {DominatorTree::Insert, BB2, BB4}});
+
+ // (3) We found a match. Compute the index of its location and exit.
+ Builder.SetInsertPoint(BB3);
+ PHINode *MatchLCSSA = Builder.CreatePHI(PtrTy, 1, "match_start");
+ PHINode *MatchPredLCSSA =
+ Builder.CreatePHI(MatchPred->getType(), 1, "match_vec");
+ Value *MatchCnt = Builder.CreateIntrinsic(
+ Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()},
+ {MatchPredLCSSA, /*ZeroIsPoison=*/Builder.getInt1(true)}, nullptr,
+ "match_idx");
+ Value *MatchVal =
+ Builder.CreateGEP(CharTy, MatchLCSSA, MatchCnt, "match_res");
+ Builder.CreateBr(ExitSucc);
+ DTU.applyUpdates({{DominatorTree::Insert, BB3, ExitSucc}});
+
+ // (4) Check if we've reached the end of the needle array.
+ Builder.SetInsertPoint(BB4);
+ Value *NextNeedle =
+ Builder.CreateGEP(CharTy, Needle, ConstVF, "needle_next_vec");
+ Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedle, NeedleEnd), BB2, BB5);
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, BB4, BB2}, {DominatorTree::Insert, BB4, BB5}});
+
+ // (5) Check if we've reached the end of the search array.
+ Builder.SetInsertPoint(BB5);
+ Value *NextSearch =
+ Builder.CreateGEP(CharTy, Search, ConstVF, "search_next_vec");
+ Builder.CreateCondBr(Builder.CreateICmpULT(NextSearch, SearchEnd), BB1,
+ ExitFail);
+ DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1},
+ {DominatorTree::Insert, BB5, ExitFail}});
+
+ // Set up the PHI nodes.
+ Search->addIncoming(SearchStart, BB0);
+ Search->addIncoming(NextSearch, BB5);
+ Needle->addIncoming(NeedleStart, BB1);
+ Needle->addIncoming(NextNeedle, BB4);
+ // These are needed to retain LCSSA form.
+ MatchLCSSA->addIncoming(Search, BB2);
+ MatchPredLCSSA->addIncoming(MatchPred, BB2);
+
+ if (VerifyLoops) {
+ OuterLoop->verifyLoop();
+ InnerLoop->verifyLoop();
+ if (!OuterLoop->isRecursivelyLCSSAForm(*DT, *LI))
+ report_fatal_error("Loops must remain in LCSSA form!");
+ }
+
+ return MatchVal;
+}
+
+void LoopIdiomVectorize::transformFindFirstByte(
+ PHINode *IndPhi, unsigned VF, Type *CharTy, BasicBlock *ExitSucc,
+ BasicBlock *ExitFail, Value *SearchStart, Value *SearchEnd,
+ Value *NeedleStart, Value *NeedleEnd) {
+ // Insert the find first byte code at the end of the preheader block.
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+ IRBuilder<> Builder(PHBranch);
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
+
+ Value *MatchVal =
+ expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail,
+ SearchStart, SearchEnd, NeedleStart, NeedleEnd);
+
+ assert(PHBranch->isUnconditional() &&
+ "Expected preheader to terminate with an unconditional branch.");
+
+ // Add new incoming values with the result of the transformation to PHINodes
+ // of ExitSucc that use IndPhi.
+ for (auto *U : llvm::make_early_inc_range(IndPhi->users())) {
+ auto *PN = dyn_cast<PHINode>(U);
+ if (PN && PN->getParent() == ExitSucc)
+ PN->addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent());
+ }
+
+ if (VerifyLoops && CurLoop->getParentLoop()) {
+ CurLoop->getParentLoop()->verifyLoop();
+ if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
+ report_fatal_error("Loops must remain in LCSSA form!");
+ }
+}
diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
new file mode 100644
index 000000000000000..8ef2a515066064a
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
@@ -0,0 +1,671 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=aarch64 -mattr=+sve -passes='loop(loop-idiom-vectorize)' -verify-loop-info -verify-dom-info -S < %s | FileCheck %s
+; RUN: opt -mtriple=aarch64 -mattr=+sve -passes='loop(loop-idiom-vectorize)' -disable-loop-idiom-vectorize-find-first-byte -S < %s | FileCheck -check-prefix=DISABLE %s
+
+; Base case based on `libcxx/include/__algorithm/find_first_of.h':
+; char* find_first_of(char *first, char *last, char *s_first, char *s_last) {
+; for (; first != last; ++first)
+; for (char *it = s_first; it != s_last; ++it)
+; if (*first == *it)
+; return first;
+; return last;
+; }
+define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 {
+; CHECK-LABEL: define ptr @find_first_of_i8(
+; CHECK-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]]
+; CHECK-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]]
+; CHECK-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]]
+; CHECK-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]]
+; CHECK: [[HEADER_PREHEADER]]:
+; CHECK-NEXT: br label %[[MEM_CHECK:.*]]
+; CHECK: [[MEM_CHECK]]:
+; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
+; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
+; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
+; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
+; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
+; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
+; CHECK-NEXT: [[NEEDLE_START_PAGE:%.*]] = lshr i64 [[NEEDLE_START_INT]], 12
+; CHECK-NEXT: [[NEEDLE_END_PAGE:%.*]] = lshr i64 [[NEEDLE_END_INT]], 12
+; CHECK-NEXT: [[SEARCH_PAGE_CMP:%.*]] = icmp ne i64 [[SEARCH_START_PAGE]], [[SEARCH_END_PAGE]]
+; CHECK-NEXT: [[NEEDLE_PAGE_CMP:%.*]] = icmp ne i64 [[NEEDLE_START_PAGE]], [[NEEDLE_END_PAGE]]
+; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
+; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: [[FIND_FIRST_VEC_HEADER]]:
+; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
+; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
+; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
+; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]]
+; CHECK: [[MATCH_CHECK_VEC]]:
+; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
+; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
+; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
+; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0
+; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0
+; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[NEEDLE0_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], <vscale x 16 x i8> [[NEEDLE0_SPLAT]]
+; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
+; CHECK-NEXT: [[MATCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
+; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]]
+; CHECK: [[CALCULATE_MATCH]]:
+; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
+; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
+; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[MATCH_VEC]], i1 true)
+; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
+; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]]
+; CHECK: [[NEEDLE_CHECK_VEC]]:
+; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]]
+; CHECK: [[SEARCH_CHECK_VEC]]:
+; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]]
+; CHECK: [[SCALAR_PREHEADER]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[SCALAR_PREHEADER]] ]
+; CHECK-NEXT: [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1
+; CHECK-NEXT: br label %[[MATCH_CHECK:.*]]
+; CHECK: [[NEEDLE_CHECK:.*]]:
+; CHECK-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1
+; CHECK-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]]
+; CHECK: [[MATCH_CHECK]]:
+; CHECK-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ]
+; CHECK-NEXT: [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1
+; CHECK-NEXT: [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]]
+; CHECK-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT]], label %[[NEEDLE_CHECK]]
+; CHECK: [[SEARCH_CHECK]]:
+; CHECK-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1
+; CHECK-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]]
+; CHECK-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1]], label %[[HEADER]]
+; CHECK: [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ], [ [[MATCH_RES]], %[[CALCULATE_MATCH]] ]
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT_LOOPEXIT1]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ]
+; CHECK-NEXT: ret ptr [[RES]]
+;
+; DISABLE-LABEL: define ptr @find_first_of_i8(
+; DISABLE-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0:[0-9]+]] {
+; DISABLE-NEXT: [[ENTRY:.*]]:
+; DISABLE-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]]
+; DISABLE-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]]
+; DISABLE-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]]
+; DISABLE-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]]
+; DISABLE: [[HEADER_PREHEADER]]:
+; DISABLE-NEXT: br label %[[HEADER:.*]]
+; DISABLE: [[HEADER]]:
+; DISABLE-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[HEADER_PREHEADER]] ]
+; DISABLE-NEXT: [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1
+; DISABLE-NEXT: br label %[[MATCH_CHECK:.*]]
+; DISABLE: [[NEEDLE_CHECK:.*]]:
+; DISABLE-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1
+; DISABLE-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]]
+; DISABLE-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]]
+; DISABLE: [[MATCH_CHECK]]:
+; DISABLE-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ]
+; DISABLE-NEXT: [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1
+; DISABLE-NEXT: [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]]
+; DISABLE-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[NEEDLE_CHECK]]
+; DISABLE: [[SEARCH_CHECK]]:
+; DISABLE-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1
+; DISABLE-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]]
+; DISABLE-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1:.*]], label %[[HEADER]]
+; DISABLE: [[EXIT_LOOPEXIT]]:
+; DISABLE-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ]
+; DISABLE-NEXT: br label %[[EXIT]]
+; DISABLE: [[EXIT_LOOPEXIT1]]:
+; DISABLE-NEXT: br label %[[EXIT]]
+; DISABLE: [[EXIT]]:
+; DISABLE-NEXT: [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ]
+; DISABLE-NEXT: ret ptr [[RES]]
+;
+entry: ; Guard: branch straight to %exit when the search range or the needle range is empty.
+ %search_test = icmp eq ptr %search_start, %search_end
+ %needle_test = icmp eq ptr %needle_start, %needle_end
+ %combined_test = or i1 %search_test, %needle_test
+ br i1 %combined_test, label %exit, label %header
+
+header: ; Outer-loop header: load the search element for this iteration.
+ %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ]
+ %search_load = load i8, ptr %search_ptr, align 1
+ br label %match_check
+
+needle_check: ; Inner-loop latch: advance the needle pointer; go to %search_check once it reaches %needle_end.
+ %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1
+ %needle_cmp = icmp eq ptr %needle_next, %needle_end
+ br i1 %needle_cmp, label %search_check, label %match_check
+
+match_check: ; Inner-loop header: compare the search element with the current needle element; leave to %exit on equality.
+ %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ]
+ %needle_load = load i8, ptr %needle_ptr, align 1
+ %match_cmp = icmp eq i8 %search_load, %needle_load
+ br i1 %match_cmp, label %exit, label %needle_check
+
+search_check: ; Outer-loop latch: advance the search pointer; leave to %exit once it reaches %search_end.
+ %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1
+ %search_cmp = icmp eq ptr %search_next, %search_end
+ br i1 %search_cmp, label %exit, label %header
+
+exit: ; %res is %search_ptr when a match was found and %search_end otherwise.
+ %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ]
+ ret ptr %res
+}
+
+; Equivalent to @find_first_of_i8 but with i16.
+; This is accepted and generates a similar loop.
+define ptr @find_first_of_i16(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 {
+; CHECK-LABEL: define ptr @find_first_of_i16(
+; CHECK-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]]
+; CHECK-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]]
+; CHECK-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]]
+; CHECK-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]]
+; CHECK: [[HEADER_PREHEADER]]:
+; CHECK-NEXT: br label %[[MEM_CHECK:.*]]
+; CHECK: [[MEM_CHECK]]:
+; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
+; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
+; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
+; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
+; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
+; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
+; CHECK-NEXT: [[NEEDLE_START_PAGE:%.*]] = lshr i64 [[NEEDLE_START_INT]], 12
+; CHECK-NEXT: [[NEEDLE_END_PAGE:%.*]] = lshr i64 [[NEEDLE_END_INT]], 12
+; CHECK-NEXT: [[SEARCH_PAGE_CMP:%.*]] = icmp ne i64 [[SEARCH_START_PAGE]], [[SEARCH_END_PAGE]]
+; CHECK-NEXT: [[NEEDLE_PAGE_CMP:%.*]] = icmp ne i64 [[NEEDLE_START_PAGE]], [[NEEDLE_END_PAGE]]
+; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
+; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]]
+; CHECK: [[FIND_FIRST_VEC_HEADER]]:
+; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
+; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
+; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 8 x i1> [[TMP0]], [[SEARCH_PRED]]
+; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PSEARCH]], i32 1, <vscale x 8 x i1> [[SEARCH_MASKED]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]]
+; CHECK: [[MATCH_CHECK_VEC]]:
+; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
+; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
+; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 8 x i1> [[TMP0]], [[NEEDLE_PRED]]
+; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PNEEDLE]], i32 1, <vscale x 8 x i1> [[NEEDLE_MASKED]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 8 x i16> [[NEEDLE_LOAD_VEC]], i64 0
+; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[NEEDLE0]], i64 0
+; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[NEEDLE0_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select <vscale x 8 x i1> [[NEEDLE_MASKED]], <vscale x 8 x i16> [[NEEDLE_LOAD_VEC]], <vscale x 8 x i16> [[NEEDLE0_SPLAT]]
+; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[NEEDLE_SPLAT]], i64 0)
+; CHECK-NEXT: [[MATCH_PRED:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> [[SEARCH_LOAD_VEC]], <8 x i16> [[NEEDLE_VEC]], <vscale x 8 x i1> [[SEARCH_MASKED]])
+; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1(<vscale x 8 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]]
+; CHECK: [[CALCULATE_MATCH]]:
+; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
+; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 8 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
+; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[MATCH_VEC]], i1 true)
+; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i16, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
+; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]]
+; CHECK: [[NEEDLE_CHECK_VEC]]:
+; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i16, ptr [[PNEEDLE]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]]
+; CHECK: [[SEARCH_CHECK_VEC]]:
+; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i16, ptr [[PSEARCH]], i64 8
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]]
+; CHECK: [[SCALAR_PREHEADER]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[SCALAR_PREHEADER]] ]
+; CHECK-NEXT: [[SEARCH_LOAD:%.*]] = load i16, ptr [[SEARCH_PTR]], align 1
+; CHECK-NEXT: br label %[[MATCH_CHECK:.*]]
+; CHECK: [[NEEDLE_CHECK:.*]]:
+; CHECK-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i16, ptr [[NEEDLE_PTR:%.*]], i64 1
+; CHECK-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]]
+; CHECK: [[MATCH_CHECK]]:
+; CHECK-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ]
+; CHECK-NEXT: [[NEEDLE_LOAD:%.*]] = load i16, ptr [[NEEDLE_PTR]], align 1
+; CHECK-NEXT: [[MATCH_CMP:%.*]] = icmp eq i16 [[SEARCH_LOAD]], [[NEEDLE_LOAD]]
+; CHECK-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT]], label %[[NEEDLE_CHECK]]
+; CHECK: [[SEARCH_CHECK]]:
+; CHECK-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i16, ptr [[SEARCH_PTR]], i64 1
+; CHECK-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]]
+; CHECK-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1]], label %[[HEADER]]
+; CHECK: [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ], [ [[MATCH_RES]], %[[CALCULATE_MATCH]] ]
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT_LOOPEXIT1]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ]
+; CHECK-NEXT: ret ptr [[RES]]
+;
+; DISABLE-LABEL: define ptr @find_first_of_i16(
+; DISABLE-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] {
+; DISABLE-NEXT: [[ENTRY:.*]]:
+; DISABLE-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]]
+; DISABLE-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]]
+; DISABLE-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]]
+; DISABLE-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]]
+; DISABLE: [[HEADER_PREHEADER]]:
+; DISABLE-NEXT: br label %[[HEADER:.*]]
+; DISABLE: [[HEADER]]:
+; DISABLE-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[HEADER_PREHEADER]] ]
+; DISABLE-NEXT: [[SEARCH_LOAD:%.*]] = load i16, ptr [[SEARCH_PTR]], align 1
+; DISABLE-NEXT: br label %[[MATCH_CHECK:.*]]
+; DISABLE: [[NEEDLE_CHECK:.*]]:
+; DISABLE-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i16, ptr [[NEEDLE_PTR:%.*]], i64 1
+; DISABLE-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]]
+; DISABLE-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]]
+; DISABLE: [[MATCH_CHECK]]:
+; DISABLE-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ]
+; DISABLE-NEXT: [[NEEDLE_LOAD:%.*]] = load i16, ptr [[NEEDLE_PTR]], align 1
+; DISABLE-NEXT: [[MATCH_CMP:%.*]] = icmp eq i16 [[SEARCH_LOAD]], [[NEEDLE_LOAD]]
+; DISABLE-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[NEEDLE_CHECK]]
+; DISABLE: [[SEARCH_CHECK]]:
+; DISABLE-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i16, ptr [[SEARCH_PTR]], i64 1
+; DISABLE-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]]
+; DISABLE-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1:.*]], label %[[HEADER]]
+; DISABLE: [[EXIT_LOOPEXIT]]:
+; DISABLE-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ]
+; DISABLE-NEXT: br label %[[EXIT]]
+; DISABLE: [[EXIT_LOOPEXIT1]]:
+; DISABLE-NEXT: br label %[[EXIT]]
+; DISABLE: [[EXIT]]:
+; DISABLE-NEXT: [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ]
+; DISABLE-NEXT: ret ptr [[RES]]
+;
+entry: ; Guard: branch straight to %exit when the search range or the needle range is empty.
+ %search_test = icmp eq ptr %search_start, %search_end
+ %needle_test = icmp eq ptr %needle_start, %needle_end
+ %combined_test = or i1 %search_test, %needle_test
+ br i1 %combined_test, label %exit, label %header
+
+header: ; Outer-loop header: load the i16 search element for this iteration.
+ %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ]
+ %search_load = load i16, ptr %search_ptr, align 1
+ br label %match_check
+
+needle_check: ; Inner-loop latch: advance the needle pointer by one i16; go to %search_check once it reaches %needle_end.
+ %needle_next = getelementptr inbounds i16, ptr %needle_ptr, i64 1
+ %needle_cmp = icmp eq ptr %needle_next, %needle_end
+ br i1 %needle_cmp, label %search_check, label %match_check
+
+match_check: ; Inner-loop header: compare the search element with the current needle element; leave to %exit on equality.
+ %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ]
+ %needle_load = load i16, ptr %needle_ptr, align 1
+ %match_cmp = icmp eq i16 %search_load, %needle_load
+ br i1 %match_cmp, label %exit, label %needle_check
+
+search_check: ; Outer-loop latch: advance the search pointer by one i16; leave to %exit once it reaches %search_end.
+ %search_next = getelementptr inbounds i16, ptr %search_ptr, i64 1
+ %search_cmp = icmp eq ptr %search_next, %search_end
+ br i1 %search_cmp, label %exit, label %header
+
+exit: ; %res is %search_ptr when a match was found and %search_end otherwise.
+ %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ]
+ ret ptr %res
+}
+
+; Same as @find_first_of_i8 but with two intermediate exit blocks for the
+; "success" (exit_succ) and "failure" (exit_fail) paths.
+define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 {
+; CHECK-LABEL: define ptr @find_first_of_i8_multi_exit(
+; CHECK-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]]
+; CHECK-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]]
+; CHECK-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]]
+; CHECK-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT_FAIL:.*]], label %[[HEADER_PREHEADER:.*]]
+; CHECK: [[HEADER_PREHEADER]]:
+; CHECK-NEXT: br label %[[MEM_CHECK:.*]]
+; CHECK: [[MEM_CHECK]]:
+; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
+; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
+; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
+; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
+; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
+; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
+; CHECK-NEXT: [[NEEDLE_START_PAGE:%.*]] = lshr i64 [[NEEDLE_START_INT]], 12
+; CHECK-NEXT: [[NEEDLE_END_PAGE:%.*]] = lshr i64 [[NEEDLE_END_INT]], 12
+; CHECK-NEXT: [[SEARCH_PAGE_CMP:%.*]] = icmp ne i64 [[SEARCH_START_PAGE]], [[SEARCH_END_PAGE]]
+; CHECK-NEXT: [[NEEDLE_PAGE_CMP:%.*]] = icmp ne i64 [[NEEDLE_START_PAGE]], [[NEEDLE_END_PAGE]]
+; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
+; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]]
+; CHECK: [[FIND_FIRST_VEC_HEADER]]:
+; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
+; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
+; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
+; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]]
+; CHECK: [[MATCH_CHECK_VEC]]:
+; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
+; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
+; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
+; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0
+; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0
+; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[NEEDLE0_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], <vscale x 16 x i8> [[NEEDLE0_SPLAT]]
+; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
+; CHECK-NEXT: [[MATCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
+; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]]
+; CHECK: [[CALCULATE_MATCH]]:
+; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
+; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
+; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[MATCH_VEC]], i1 true)
+; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
+; CHECK-NEXT: br label %[[EXIT_SUCC:.*]]
+; CHECK: [[NEEDLE_CHECK_VEC]]:
+; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]]
+; CHECK: [[SEARCH_CHECK_VEC]]:
+; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_FAIL_LOOPEXIT:.*]]
+; CHECK: [[SCALAR_PREHEADER]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[SCALAR_PREHEADER]] ]
+; CHECK-NEXT: [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1
+; CHECK-NEXT: br label %[[MATCH_CHECK:.*]]
+; CHECK: [[NEEDLE_CHECK:.*]]:
+; CHECK-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1
+; CHECK-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]]
+; CHECK: [[MATCH_CHECK]]:
+; CHECK-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ]
+; CHECK-NEXT: [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1
+; CHECK-NEXT: [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]]
+; CHECK-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_SUCC]], label %[[NEEDLE_CHECK]]
+; CHECK: [[SEARCH_CHECK]]:
+; CHECK-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1
+; CHECK-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]]
+; CHECK-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_FAIL_LOOPEXIT]], label %[[HEADER]]
+; CHECK: [[EXIT_SUCC]]:
+; CHECK-NEXT: [[RES_SUCC:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ], [ [[MATCH_RES]], %[[CALCULATE_MATCH]] ]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT_FAIL_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[EXIT_FAIL]]
+; CHECK: [[EXIT_FAIL]]:
+; CHECK-NEXT: [[RES_FAIL:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_END]], %[[EXIT_FAIL_LOOPEXIT]] ]
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[RES_SUCC]], %[[EXIT_SUCC]] ], [ [[RES_FAIL]], %[[EXIT_FAIL]] ]
+; CHECK-NEXT: ret ptr [[RES]]
+;
+; DISABLE-LABEL: define ptr @find_first_of_i8_multi_exit(
+; DISABLE-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] {
+; DISABLE-NEXT: [[ENTRY:.*]]:
+; DISABLE-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]]
+; DISABLE-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]]
+; DISABLE-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]]
+; DISABLE-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT_FAIL:.*]], label %[[HEADER_PREHEADER:.*]]
+; DISABLE: [[HEADER_PREHEADER]]:
+; DISABLE-NEXT: br label %[[HEADER:.*]]
+; DISABLE: [[HEADER]]:
+; DISABLE-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[HEADER_PREHEADER]] ]
+; DISABLE-NEXT: [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1
+; DISABLE-NEXT: br label %[[MATCH_CHECK:.*]]
+; DISABLE: [[NEEDLE_CHECK:.*]]:
+; DISABLE-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1
+; DISABLE-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]]
+; DISABLE-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]]
+; DISABLE: [[MATCH_CHECK]]:
+; DISABLE-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ]
+; DISABLE-NEXT: [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1
+; DISABLE-NEXT: [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]]
+; DISABLE-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_SUCC:.*]], label %[[NEEDLE_CHECK]]
+; DISABLE: [[SEARCH_CHECK]]:
+; DISABLE-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1
+; DISABLE-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]]
+; DISABLE-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_FAIL_LOOPEXIT:.*]], label %[[HEADER]]
+; DISABLE: [[EXIT_SUCC]]:
+; DISABLE-NEXT: [[RES_SUCC:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ]
+; DISABLE-NEXT: br label %[[EXIT:.*]]
+; DISABLE: [[EXIT_FAIL_LOOPEXIT]]:
+; DISABLE-NEXT: br label %[[EXIT_FAIL]]
+; DISABLE: [[EXIT_FAIL]]:
+; DISABLE-NEXT: [[RES_FAIL:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_END]], %[[EXIT_FAIL_LOOPEXIT]] ]
+; DISABLE-NEXT: br label %[[EXIT]]
+; DISABLE: [[EXIT]]:
+; DISABLE-NEXT: [[RES:%.*]] = phi ptr [ [[RES_SUCC]], %[[EXIT_SUCC]] ], [ [[RES_FAIL]], %[[EXIT_FAIL]] ]
+; DISABLE-NEXT: ret ptr [[RES]]
+;
+entry: ; Guard: branch straight to %exit_fail when the search range or the needle range is empty.
+ %search_test = icmp eq ptr %search_start, %search_end
+ %needle_test = icmp eq ptr %needle_start, %needle_end
+ %combined_test = or i1 %search_test, %needle_test
+ br i1 %combined_test, label %exit_fail, label %header
+
+header: ; Outer-loop header: load the search element for this iteration.
+ %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ]
+ %search_load = load i8, ptr %search_ptr, align 1
+ br label %match_check
+
+needle_check: ; Inner-loop latch: advance the needle pointer; go to %search_check once it reaches %needle_end.
+ %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1
+ %needle_cmp = icmp eq ptr %needle_next, %needle_end
+ br i1 %needle_cmp, label %search_check, label %match_check
+
+match_check: ; Inner-loop header: compare the search element with the current needle element; leave to %exit_succ on equality.
+ %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ]
+ %needle_load = load i8, ptr %needle_ptr, align 1
+ %match_cmp = icmp eq i8 %search_load, %needle_load
+ br i1 %match_cmp, label %exit_succ, label %needle_check
+
+search_check: ; Outer-loop latch: advance the search pointer; leave to %exit_fail once it reaches %search_end.
+ %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1
+ %search_cmp = icmp eq ptr %search_next, %search_end
+ br i1 %search_cmp, label %exit_fail, label %header
+
+exit_succ: ; Match found: forwards %search_ptr from the inner loop.
+ %res_succ = phi ptr [ %search_ptr, %match_check ]
+ br label %exit
+
+exit_fail: ; No match: forwards %search_end from the guard or from the exhausted outer loop.
+ %res_fail = phi ptr [ %search_end, %entry ], [ %search_end, %search_check ]
+ br label %exit
+
+exit: ; Merge the results of the two exit paths.
+ %res = phi ptr [ %res_succ, %exit_succ ], [ %res_fail, %exit_fail ]
+ ret ptr %res
+}
+
+; From here on we only test for the presence/absence of the intrinsic.
+; UTC_ARGS: --disable
+
+; Same as @find_first_of_i8 but %match_cmp uses an `ne' comparison.
+; This is rejected for now, but should eventually be supported.
+define ptr @find_first_not_of_i8(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 {
+; CHECK-LABEL: define ptr @find_first_not_of_i8(
+; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
+;
+; DISABLE-LABEL: define ptr @find_first_not_of_i8(
+; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
+;
+entry:
+ %search_test = icmp eq ptr %search_start, %search_end
+ %needle_test = icmp eq ptr %needle_start, %needle_end
+ %combined_test = or i1 %search_test, %needle_test
+ br i1 %combined_test, label %exit, label %header ; bypass the loop nest if either range is empty
+
+header:
+ %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ]
+ %search_load = load i8, ptr %search_ptr, align 1 ; search byte, loaded once per outer iteration
+ br label %match_check
+
+needle_check:
+ %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1
+ %needle_cmp = icmp eq ptr %needle_next, %needle_end
+ br i1 %needle_cmp, label %search_check, label %match_check ; needles exhausted: advance the search pointer
+
+match_check:
+ %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ]
+ %needle_load = load i8, ptr %needle_ptr, align 1
+ %match_cmp = icmp ne i8 %search_load, %needle_load ; `ne' predicate: leave the nest on the first differing byte
+ br i1 %match_cmp, label %exit, label %needle_check
+
+search_check:
+ %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1
+ %search_cmp = icmp eq ptr %search_next, %search_end
+ br i1 %search_cmp, label %exit, label %header
+
+exit:
+ %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ] ; %search_ptr only on the match_check edge
+ ret ptr %res
+}
+
+; This is the same as @find_first_of_i8 but without SVE2 (the definition has no
+; `#0' attribute group, i.e. no "+sve2"), which we require to perform the conversion.
+define ptr @find_first_of_i8_nosve2(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) {
+; CHECK-LABEL: define ptr @find_first_of_i8_nosve2(
+; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
+;
+; DISABLE-LABEL: define ptr @find_first_of_i8_nosve2(
+; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
+;
+entry:
+ %search_test = icmp eq ptr %search_start, %search_end
+ %needle_test = icmp eq ptr %needle_start, %needle_end
+ %combined_test = or i1 %search_test, %needle_test
+ br i1 %combined_test, label %exit, label %header ; bypass the loop nest if either range is empty
+
+header:
+ %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ]
+ %search_load = load i8, ptr %search_ptr, align 1 ; search byte, loaded once per outer iteration
+ br label %match_check
+
+needle_check:
+ %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1
+ %needle_cmp = icmp eq ptr %needle_next, %needle_end
+ br i1 %needle_cmp, label %search_check, label %match_check ; needles exhausted: advance the search pointer
+
+match_check:
+ %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ]
+ %needle_load = load i8, ptr %needle_ptr, align 1
+ %match_cmp = icmp eq i8 %search_load, %needle_load ; equal bytes: leave the nest with the current %search_ptr
+ br i1 %match_cmp, label %exit, label %needle_check
+
+search_check:
+ %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1
+ %search_cmp = icmp eq ptr %search_next, %search_end
+ br i1 %search_cmp, label %exit, label %header
+
+exit:
+ %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ] ; %search_ptr only on the match_check edge
+ ret ptr %res
+}
+
+; Same as @find_first_of_i8 but here we use the inner PHI (%needle_ptr) outside
+; the loop nest, through the extra %use PHI in the exit block. This isn't supported.
+define ptr @find_first_of_i8_outside_use(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 {
+; CHECK-LABEL: define ptr @find_first_of_i8_outside_use(
+; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
+;
+; DISABLE-LABEL: define ptr @find_first_of_i8_outside_use(
+; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
+;
+entry:
+ %search_test = icmp eq ptr %search_start, %search_end
+ %needle_test = icmp eq ptr %needle_start, %needle_end
+ %combined_test = or i1 %search_test, %needle_test
+ br i1 %combined_test, label %exit, label %header ; bypass the loop nest if either range is empty
+
+header:
+ %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ]
+ %search_load = load i8, ptr %search_ptr, align 1 ; search byte, loaded once per outer iteration
+ br label %match_check
+
+needle_check:
+ %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1
+ %needle_cmp = icmp eq ptr %needle_next, %needle_end
+ br i1 %needle_cmp, label %search_check, label %match_check ; needles exhausted: advance the search pointer
+
+match_check:
+ %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ]
+ %needle_load = load i8, ptr %needle_ptr, align 1
+ %match_cmp = icmp eq i8 %search_load, %needle_load ; equal bytes: leave the nest with the current %search_ptr
+ br i1 %match_cmp, label %exit, label %needle_check
+
+search_check:
+ %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1
+ %search_cmp = icmp eq ptr %search_next, %search_end
+ br i1 %search_cmp, label %exit, label %header
+
+exit:
+ %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ]
+ %use = phi ptr [ %needle_end, %entry ], [ %needle_ptr, %match_check ], [ %needle_end, %search_check ] ; otherwise unused: only makes %needle_ptr live outside the nest
+ ret ptr %res
+}
+
+; Same as @find_first_of_i8_multi_exit but `search_ptr' is used in `exit_fail'
+; (incoming value of %res_fail on the search_check edge) which should block the transform.
+define ptr @find_first_of_i8_multi_exit_outside_use(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 {
+; CHECK-LABEL: define ptr @find_first_of_i8_multi_exit_outside_use(
+; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
+;
+; DISABLE-LABEL: define ptr @find_first_of_i8_multi_exit_outside_use(
+; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
+;
+entry:
+ %search_test = icmp eq ptr %search_start, %search_end
+ %needle_test = icmp eq ptr %needle_start, %needle_end
+ %combined_test = or i1 %search_test, %needle_test
+ br i1 %combined_test, label %exit_fail, label %header ; bypass the loop nest if either range is empty
+
+header:
+ %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ]
+ %search_load = load i8, ptr %search_ptr, align 1 ; search byte, loaded once per outer iteration
+ br label %match_check
+
+needle_check:
+ %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1
+ %needle_cmp = icmp eq ptr %needle_next, %needle_end
+ br i1 %needle_cmp, label %search_check, label %match_check ; needles exhausted: advance the search pointer
+
+match_check:
+ %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ]
+ %needle_load = load i8, ptr %needle_ptr, align 1
+ %match_cmp = icmp eq i8 %search_load, %needle_load ; equal bytes: take the success exit
+ br i1 %match_cmp, label %exit_succ, label %needle_check
+
+search_check:
+ %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1
+ %search_cmp = icmp eq ptr %search_next, %search_end
+ br i1 %search_cmp, label %exit_fail, label %header ; search range exhausted: take the failure exit
+
+exit_succ:
+ %res_succ = phi ptr [ %search_ptr, %match_check ]
+ br label %exit
+
+exit_fail:
+ %res_fail = phi ptr [ %search_end, %entry ], [ %search_ptr, %search_check ] ; %search_ptr on the search_check edge is the outside use under test
+ br label %exit
+
+exit:
+ %res = phi ptr [ %res_succ, %exit_succ ], [ %res_fail, %exit_fail ] ; merge the success and failure results
+ ret ptr %res
+}
+
+attributes #0 = { "target-features"="+sve2" }
+
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 10, i32 90}
More information about the llvm-commits
mailing list