[llvm] [AArch64] Add MATCH loops to LoopIdiomVectorizePass (PR #101976)

Ricardo Jesus via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 5 06:13:34 PST 2025


https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/101976

>From 2addb7b55d40762a08a727f07567961acc05e260 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Mon, 15 Jul 2024 17:57:30 +0100
Subject: [PATCH 1/6] [AArch64] Add MATCH loops to LoopIdiomVectorizePass

This patch adds a new loop to LoopIdiomVectorizePass, enabling it to
recognise and use @llvm.experimental.vector.match to vectorise loops
such as:

    char* find_first_of(char *first, char *last,
                        char *s_first, char *s_last) {
      for (; first != last; ++first)
        for (char *it = s_first; it != s_last; ++it)
          if (*first == *it)
            return first;
      return last;
    }

These loops match the C++ standard library's std::find_first_of.
---
 .../Vectorize/LoopIdiomVectorize.cpp          | 423 ++++++++++++++++-
 llvm/test/CodeGen/AArch64/find-first-byte.ll  | 429 ++++++++++++++++++
 2 files changed, 843 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/find-first-byte.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 7af7408ed67a8c2..a874dd9f8f181ba 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -10,8 +10,10 @@
 // transforms them into more optimized versions of the same loop. In cases
 // where this happens, it can be a significant performance win.
 //
-// We currently only recognize one loop that finds the first mismatched byte
-// in an array and returns the index, i.e. something like:
+// We currently support two loops:
+//
+// 1. A loop that finds the first mismatched byte in an array and returns the
+// index, i.e. something like:
 //
 //  while (++i != n) {
 //    if (a[i] != b[i])
@@ -24,12 +26,6 @@
 // boundaries. However, even with these checks it is still profitable to do the
 // transformation.
 //
-//===----------------------------------------------------------------------===//
-//
-// NOTE: This Pass matches a really specific loop pattern because it's only
-// supposed to be a temporary solution until our LoopVectorizer is powerful
-// enought to vectorize it automatically.
-//
 // TODO List:
 //
 // * Add support for the inverse case where we scan for a matching element.
@@ -37,6 +33,35 @@
 // * Recognize loops that increment the IV *after* comparing bytes.
 // * Allow 32-bit sign-extends of the IV used by the GEP.
 //
+// 2. A loop that finds the first matching character in an array among a set of
+// possible matches, e.g.:
+//
+//   for (; first != last; ++first)
+//     for (s_it = s_first; s_it != s_last; ++s_it)
+//       if (*first == *s_it)
+//         return first;
+//   return last;
+//
+// This corresponds to std::find_first_of (for arrays of bytes) from the C++
+// standard library. This function can be implemented efficiently for targets
+// that support @llvm.experimental.vector.match. For example, on AArch64 targets
+// that implement SVE2, this lowers to a MATCH instruction, which enables us to
+// perform up to 16x16=256 comparisons in one go. This can lead to very
+// significant speedups.
+//
+// TODO:
+//
+// * Add support for `find_first_not_of' loops (i.e. with not-equal comparison).
+// * Make VF a configurable parameter (right now we assume 128-bit vectors).
+// * Potentially adjust the cost model to let the transformation kick in even if
+//   @llvm.experimental.vector.match doesn't have direct support in hardware.
+//
+//===----------------------------------------------------------------------===//
+//
+// NOTE: This Pass matches really specific loop patterns because it's only
+// supposed to be a temporary solution until our LoopVectorizer is powerful
+// enough to vectorize them automatically.
+//
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
@@ -79,6 +104,11 @@ static cl::opt<unsigned>
               cl::desc("The vectorization factor for byte-compare patterns."),
               cl::init(16));
 
+static cl::opt<bool>
+    DisableFindFirstByte("disable-loop-idiom-vectorize-find-first-byte",
+                         cl::Hidden, cl::init(false),
+                         cl::desc("Do not convert find-first-byte loop(s)."));
+
 static cl::opt<bool>
     VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false),
                 cl::desc("Verify loops generated Loop Idiom Vectorize Pass."));
@@ -136,6 +166,19 @@ class LoopIdiomVectorize {
                             PHINode *IndPhi, Value *MaxLen, Instruction *Index,
                             Value *Start, bool IncIdx, BasicBlock *FoundBB,
                             BasicBlock *EndBB);
+
+  bool recognizeFindFirstByte();
+
+  Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU,
+                             unsigned VF, Type *CharTy, BasicBlock *ExitSucc,
+                             BasicBlock *ExitFail, Value *SearchStart,
+                             Value *SearchEnd, Value *NeedleStart,
+                             Value *NeedleEnd);
+
+  void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy,
+                              BasicBlock *ExitSucc, BasicBlock *ExitFail,
+                              Value *SearchStart, Value *SearchEnd,
+                              Value *NeedleStart, Value *NeedleEnd);
   /// @}
 };
 } // anonymous namespace
@@ -190,7 +233,13 @@ bool LoopIdiomVectorize::run(Loop *L) {
   LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %"
                     << CurLoop->getHeader()->getName() << "\n");
 
-  return recognizeByteCompare();
+  if (recognizeByteCompare())
+    return true;
+
+  if (recognizeFindFirstByte())
+    return true;
+
+  return false;
 }
 
 bool LoopIdiomVectorize::recognizeByteCompare() {
@@ -939,3 +988,359 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
       report_fatal_error("Loops must remain in LCSSA form!");
   }
 }
+
+bool LoopIdiomVectorize::recognizeFindFirstByte() {
+  // Currently the transformation only works on scalable vector types, although
+  // there is no fundamental reason why it cannot be made to work for fixed
+  // vectors too.
+  if (!TTI->supportsScalableVectors() || DisableFindFirstByte)
+    return false;
+
+  // Define some constants we need throughout.
+  BasicBlock *Header = CurLoop->getHeader();
+  LLVMContext &Ctx = Header->getContext();
+
+  // We are expecting the four blocks defined below: Header, MatchBB, InnerBB,
+  // and OuterBB. For now, we will bail out for almost anything else. The four
+  // blocks contain one nested loop.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 4 ||
+      CurLoop->getSubLoops().size() != 1)
+    return false;
+
+  auto *InnerLoop = CurLoop->getSubLoops().front();
+  PHINode *IndPhi = dyn_cast<PHINode>(&Header->front());
+  if (!IndPhi || IndPhi->getNumIncomingValues() != 2)
+    return false;
+
+  // Check instruction counts.
+  auto LoopBlocks = CurLoop->getBlocks();
+  if (LoopBlocks[0]->sizeWithoutDebug() > 3 ||
+      LoopBlocks[1]->sizeWithoutDebug() > 4 ||
+      LoopBlocks[2]->sizeWithoutDebug() > 3 ||
+      LoopBlocks[3]->sizeWithoutDebug() > 3)
+    return false;
+
+  // Check that no instruction other than IndPhi has outside uses.
+  for (BasicBlock *BB : LoopBlocks)
+    for (Instruction &I : *BB)
+      if (&I != IndPhi)
+        for (User *U : I.users())
+          if (!CurLoop->contains(cast<Instruction>(U)))
+            return false;
+
+  // Match the branch instruction in the header. We are expecting an
+  // unconditional branch to the inner loop.
+  //
+  // Header:
+  //   %14 = phi ptr [ %24, %OuterBB ], [ %3, %Header.preheader ]
+  //   %15 = load i8, ptr %14, align 1
+  //   br label %MatchBB
+  BasicBlock *MatchBB;
+  if (!match(Header->getTerminator(), m_UnconditionalBr(MatchBB)) ||
+      !InnerLoop->contains(MatchBB))
+    return false;
+
+  // MatchBB should be the entrypoint into the inner loop containing the
+  // comparison between a search element and a needle.
+  //
+  // MatchBB:
+  //   %20 = phi ptr [ %7, %Header ], [ %17, %InnerBB ]
+  //   %21 = load i8, ptr %20, align 1
+  //   %22 = icmp eq i8 %15, %21
+  //   br i1 %22, label %ExitSucc, label %InnerBB
+  BasicBlock *ExitSucc, *InnerBB;
+  Value *LoadA, *LoadB;
+  ICmpInst::Predicate MatchPred;
+  if (!match(MatchBB->getTerminator(),
+             m_Br(m_ICmp(MatchPred, m_Value(LoadA), m_Value(LoadB)),
+                  m_BasicBlock(ExitSucc), m_BasicBlock(InnerBB))) ||
+      MatchPred != ICmpInst::Predicate::ICMP_EQ ||
+      !InnerLoop->contains(InnerBB))
+    return false;
+
+  // We expect outside uses of `IndPhi' in ExitSucc (and only there).
+  for (User *U : IndPhi->users())
+    if (!CurLoop->contains(cast<Instruction>(U)))
+      if (auto *PN = dyn_cast<PHINode>(U); !PN || PN->getParent() != ExitSucc)
+        return false;
+
+  // Match the loads and check they are simple.
+  Value *A, *B;
+  if (!match(LoadA, m_Load(m_Value(A))) || !cast<LoadInst>(LoadA)->isSimple() ||
+      !match(LoadB, m_Load(m_Value(B))) || !cast<LoadInst>(LoadB)->isSimple())
+    return false;
+
+  // Check we are loading valid characters.
+  Type *CharTy = LoadA->getType();
+  if (!CharTy->isIntegerTy() || LoadB->getType() != CharTy)
+    return false;
+
+  // Pick the vectorisation factor based on CharTy, work out the cost of the
+  // match intrinsic and decide if we should use it.
+  // Note: For the time being we assume 128-bit vectors.
+  unsigned VF = 128 / CharTy->getIntegerBitWidth();
+  SmallVector<Type *> Args = {
+      ScalableVectorType::get(CharTy, VF), FixedVectorType::get(CharTy, VF),
+      ScalableVectorType::get(Type::getInt1Ty(Ctx), VF)};
+  IntrinsicCostAttributes Attrs(Intrinsic::experimental_vector_match, Args[2],
+                                Args);
+  if (TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_SizeAndLatency) > 4)
+    return false;
+
+  // The loads come from two PHIs, each with two incoming values.
+  PHINode *PNA = dyn_cast<PHINode>(A);
+  PHINode *PNB = dyn_cast<PHINode>(B);
+  if (!PNA || PNA->getNumIncomingValues() != 2 || !PNB ||
+      PNB->getNumIncomingValues() != 2)
+    return false;
+
+  // One PHI comes from the outer loop (PNA), the other one from the inner loop
+  // (PNB). PNA effectively corresponds to IndPhi.
+  if (InnerLoop->contains(PNA))
+    std::swap(PNA, PNB);
+  if (PNA != &Header->front() || PNB != &MatchBB->front())
+    return false;
+
+  // The incoming values of both PHI nodes should be a gep of 1.
+  Value *StartA = PNA->getIncomingValue(0);
+  Value *IndexA = PNA->getIncomingValue(1);
+  if (CurLoop->contains(PNA->getIncomingBlock(0)))
+    std::swap(StartA, IndexA);
+
+  Value *StartB = PNB->getIncomingValue(0);
+  Value *IndexB = PNB->getIncomingValue(1);
+  if (InnerLoop->contains(PNB->getIncomingBlock(0)))
+    std::swap(StartB, IndexB);
+
+  // Match the GEPs.
+  if (!match(IndexA, m_GEP(m_Specific(PNA), m_One())) ||
+      !match(IndexB, m_GEP(m_Specific(PNB), m_One())))
+    return false;
+
+  // Check the GEPs result type matches `CharTy'.
+  GetElementPtrInst *GEPA = cast<GetElementPtrInst>(IndexA);
+  GetElementPtrInst *GEPB = cast<GetElementPtrInst>(IndexB);
+  if (GEPA->getResultElementType() != CharTy ||
+      GEPB->getResultElementType() != CharTy)
+    return false;
+
+  // InnerBB should increment the address of the needle pointer.
+  //
+  // InnerBB:
+  //   %17 = getelementptr inbounds i8, ptr %20, i64 1
+  //   %18 = icmp eq ptr %17, %10
+  //   br i1 %18, label %OuterBB, label %MatchBB
+  BasicBlock *OuterBB;
+  Value *EndB;
+  if (!match(InnerBB->getTerminator(),
+             m_Br(m_ICmp(MatchPred, m_Specific(GEPB), m_Value(EndB)),
+                  m_BasicBlock(OuterBB), m_Specific(MatchBB))) ||
+      MatchPred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(OuterBB))
+    return false;
+
+  // OuterBB should increment the address of the search element pointer.
+  //
+  // OuterBB:
+  //   %24 = getelementptr inbounds i8, ptr %14, i64 1
+  //   %25 = icmp eq ptr %24, %6
+  //   br i1 %25, label %ExitFail, label %Header
+  BasicBlock *ExitFail;
+  Value *EndA;
+  if (!match(OuterBB->getTerminator(),
+             m_Br(m_ICmp(MatchPred, m_Specific(GEPA), m_Value(EndA)),
+                  m_BasicBlock(ExitFail), m_Specific(Header))) ||
+      MatchPred != ICmpInst::Predicate::ICMP_EQ)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Found idiom in loop: \n" << *CurLoop << "\n\n");
+
+  transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, StartA, EndA,
+                         StartB, EndB);
+  return true;
+}
+
+Value *LoopIdiomVectorize::expandFindFirstByte(
+    IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy,
+    BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *SearchStart,
+    Value *SearchEnd, Value *NeedleStart, Value *NeedleEnd) {
+  // Set up some types and constants that we intend to reuse.
+  auto *PtrTy = Builder.getPtrTy();
+  auto *I64Ty = Builder.getInt64Ty();
+  auto *PredVTy = ScalableVectorType::get(Builder.getInt1Ty(), VF);
+  auto *CharVTy = ScalableVectorType::get(CharTy, VF);
+  auto *ConstVF = ConstantInt::get(I64Ty, VF);
+
+  // Other common arguments.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  LLVMContext &Ctx = Preheader->getContext();
+  Value *Passthru = ConstantInt::getNullValue(CharVTy);
+
+  // Split block in the original loop preheader.
+  // SPH is the new preheader to the old scalar loop.
+  BasicBlock *SPH = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
+                               nullptr, "scalar_ph");
+
+  // Create the blocks that we're going to use.
+  //
+  // We will have the following loops:
+  // (O) Outer loop where we iterate over the elements of the search array.
+  // (I) Inner loop where we iterate over the elements of the needle array.
+  //
+  // Overall, the blocks do the following:
+  // (1) Load the search array. Go to (2).
+  // (2) (a) Load the needle array.
+  //     (b) Splat the first element to the inactive lanes.
+  //     (c) Check if any elements match. If so go to (3), otherwise go to (4).
+  // (3) Compute the index of the first match and exit.
+  // (4) Check if we've reached the end of the needle array. If not loop back to
+  //     (2), otherwise go to (5).
+  // (5) Check if we've reached the end of the search array. If not loop back to
+  //     (1), otherwise exit.
+  // Block (3) is not part of any loop. Blocks (1,5) and (2,4) belong to the
+  // outer and inner loops, respectively.
+  BasicBlock *BB1 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+  BasicBlock *BB2 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+  BasicBlock *BB3 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+  BasicBlock *BB4 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+  BasicBlock *BB5 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+
+  // Update LoopInfo with the new loops.
+  auto OuterLoop = LI->AllocateLoop();
+  auto InnerLoop = LI->AllocateLoop();
+
+  if (auto ParentLoop = CurLoop->getParentLoop()) {
+    ParentLoop->addChildLoop(OuterLoop);
+    ParentLoop->addBasicBlockToLoop(BB3, *LI);
+  } else {
+    LI->addTopLevelLoop(OuterLoop);
+  }
+
+  // Add the inner loop to the outer.
+  OuterLoop->addChildLoop(InnerLoop);
+
+  // Add the new basic blocks to the corresponding loops.
+  OuterLoop->addBasicBlockToLoop(BB1, *LI);
+  OuterLoop->addBasicBlockToLoop(BB5, *LI);
+  InnerLoop->addBasicBlockToLoop(BB2, *LI);
+  InnerLoop->addBasicBlockToLoop(BB4, *LI);
+
+  // Keep the old scalar loop reachable via a never-taken branch and create a
+  // predicate whose first VF lanes are active.
+  Builder.SetInsertPoint(Preheader->getTerminator());
+  Value *Pred16 =
+      Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+                              {ConstantInt::get(I64Ty, 0), ConstVF});
+  Builder.CreateCondBr(Builder.getFalse(), SPH, BB1);
+  Preheader->getTerminator()->eraseFromParent();
+  DTU.applyUpdates({{DominatorTree::Insert, Preheader, BB1}});
+
+  // (1) Load the search array and branch to the inner loop.
+  Builder.SetInsertPoint(BB1);
+  PHINode *Search = Builder.CreatePHI(PtrTy, 2, "psearch");
+  Value *PredSearch =
+      Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+                              {Builder.CreatePointerCast(Search, I64Ty),
+                               Builder.CreatePointerCast(SearchEnd, I64Ty)});
+  PredSearch = Builder.CreateAnd(Pred16, PredSearch);
+  Value *LoadSearch =
+      Builder.CreateMaskedLoad(CharVTy, Search, Align(1), PredSearch, Passthru);
+  Builder.CreateBr(BB2);
+  DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2}});
+
+  // (2) Inner loop.
+  Builder.SetInsertPoint(BB2);
+  PHINode *Needle = Builder.CreatePHI(PtrTy, 2, "pneedle");
+
+  // (2.a) Load the needle array.
+  Value *PredNeedle =
+      Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+                              {Builder.CreatePointerCast(Needle, I64Ty),
+                               Builder.CreatePointerCast(NeedleEnd, I64Ty)});
+  PredNeedle = Builder.CreateAnd(Pred16, PredNeedle);
+  Value *LoadNeedle =
+      Builder.CreateMaskedLoad(CharVTy, Needle, Align(1), PredNeedle, Passthru);
+
+  // (2.b) Splat the first element to the inactive lanes.
+  Value *Needle0 = Builder.CreateExtractElement(LoadNeedle, uint64_t(0));
+  Value *Needle0Splat =
+      Builder.CreateVectorSplat(ElementCount::getScalable(VF), Needle0);
+  LoadNeedle = Builder.CreateSelect(PredNeedle, LoadNeedle, Needle0Splat);
+  LoadNeedle = Builder.CreateExtractVector(
+      FixedVectorType::get(CharTy, VF), LoadNeedle, ConstantInt::get(I64Ty, 0));
+
+  // (2.c) Test if there's a match.
+  Value *MatchPred = Builder.CreateIntrinsic(
+      Intrinsic::experimental_vector_match, {CharVTy, LoadNeedle->getType()},
+      {LoadSearch, LoadNeedle, PredSearch});
+  Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred);
+  Builder.CreateCondBr(IfAnyMatch, BB3, BB4);
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, BB2, BB3}, {DominatorTree::Insert, BB2, BB4}});
+
+  // (3) We found a match. Compute the index of its location and exit.
+  Builder.SetInsertPoint(BB3);
+  Value *MatchCnt = Builder.CreateIntrinsic(
+      Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()},
+      {MatchPred, /*ZeroIsPoison=*/Builder.getInt1(true)});
+  Value *MatchVal = Builder.CreateGEP(CharTy, Search, MatchCnt);
+  Builder.CreateBr(ExitSucc);
+  DTU.applyUpdates({{DominatorTree::Insert, BB3, ExitSucc}});
+
+  // (4) Check if we've reached the end of the needle array.
+  Builder.SetInsertPoint(BB4);
+  Value *NextNeedle = Builder.CreateGEP(CharTy, Needle, ConstVF);
+  Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedle, NeedleEnd), BB2, BB5);
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, BB4, BB2}, {DominatorTree::Insert, BB4, BB5}});
+
+  // (5) Check if we've reached the end of the search array.
+  Builder.SetInsertPoint(BB5);
+  Value *NextSearch = Builder.CreateGEP(CharTy, Search, ConstVF);
+  Builder.CreateCondBr(Builder.CreateICmpULT(NextSearch, SearchEnd), BB1,
+                       ExitFail);
+  DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1},
+                    {DominatorTree::Insert, BB5, ExitFail}});
+
+  // Set up the PHIs.
+  Search->addIncoming(SearchStart, Preheader);
+  Search->addIncoming(NextSearch, BB5);
+  Needle->addIncoming(NeedleStart, BB1);
+  Needle->addIncoming(NextNeedle, BB4);
+
+  if (VerifyLoops) {
+    OuterLoop->verifyLoop();
+    InnerLoop->verifyLoop();
+    if (!OuterLoop->isRecursivelyLCSSAForm(*DT, *LI))
+      report_fatal_error("Loops must remain in LCSSA form!");
+  }
+
+  return MatchVal;
+}
+
+void LoopIdiomVectorize::transformFindFirstByte(
+    PHINode *IndPhi, unsigned VF, Type *CharTy, BasicBlock *ExitSucc,
+    BasicBlock *ExitFail, Value *SearchStart, Value *SearchEnd,
+    Value *NeedleStart, Value *NeedleEnd) {
+  // Insert the find first byte code at the end of the preheader block.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+  IRBuilder<> Builder(PHBranch);
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+  Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
+
+  Value *MatchVal =
+      expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail,
+                          SearchStart, SearchEnd, NeedleStart, NeedleEnd);
+
+  // Add new incoming values with the result of the transformation to PHINodes
+  // of ExitSucc that use IndPhi.
+  for (auto *U : llvm::make_early_inc_range(IndPhi->users()))
+    if (auto *PN = dyn_cast<PHINode>(U); PN && PN->getParent() == ExitSucc)
+      PN->addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent());
+
+  if (VerifyLoops && CurLoop->getParentLoop()) {
+    CurLoop->getParentLoop()->verifyLoop();
+    if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
+      report_fatal_error("Loops must remain in LCSSA form!");
+  }
+}
diff --git a/llvm/test/CodeGen/AArch64/find-first-byte.ll b/llvm/test/CodeGen/AArch64/find-first-byte.ll
new file mode 100644
index 000000000000000..a324896413d78c6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/find-first-byte.ll
@@ -0,0 +1,429 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=aarch64 -mattr=+sve -passes='loop(loop-idiom-vectorize)' -verify-loop-info -verify-dom-info -S < %s | FileCheck %s
+
+; Base case based on `libcxx/include/__algorithm/find_first_of.h':
+;   char* find_first_of(char *first, char *last, char *s_first, char *s_last) {
+;     for (; first != last; ++first)
+;       for (char *it = s_first; it != s_last; ++it)
+;         if (*first == *it)
+;           return first;
+;     return last;
+;   }
+define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+; CHECK-LABEL: define ptr @find_first_of_i8(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB48:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK:       [[_PREHEADER:.*:]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[SEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[SEARCH]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP10]], i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = and <vscale x 16 x i1> [[TMP8]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[SEARCH]], i32 1, <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    br label %[[BB15:.*]]
+; CHECK:       [[BB15]]:
+; CHECK-NEXT:    [[NEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[NEEDLE]] to i64
+; CHECK-NEXT:    [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP16]], i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP19:%.*]] = and <vscale x 16 x i1> [[TMP8]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[NEEDLE]], i32 1, <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 16 x i8> [[TMP20]], i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP21]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = select <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> [[TMP20]], <vscale x 16 x i8> [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP23:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[TMP22]], i64 0)
+; CHECK-NEXT:    [[TMP24:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[TMP14]], <16 x i8> [[TMP23]], <vscale x 16 x i1> [[TMP13]])
+; CHECK-NEXT:    [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP24]])
+; CHECK-NEXT:    br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]]
+; CHECK:       [[BB26]]:
+; CHECK-NEXT:    [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP24]], i1 true)
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[SEARCH]], i64 [[TMP27]]
+; CHECK-NEXT:    br label %[[DOTLOOPEXIT:.*]]
+; CHECK:       [[TMP29]]:
+; CHECK-NEXT:    [[TMP30]] = getelementptr i8, ptr [[NEEDLE]], i64 16
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]]
+; CHECK:       [[TMP32]]:
+; CHECK-NEXT:    [[TMP33]] = getelementptr i8, ptr [[SEARCH]], i64 16
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP34]], label %[[BB9]], label %[[DOTLOOPEXIT1:.*]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    br label %[[BB35:.*]]
+; CHECK:       [[BB35]]:
+; CHECK-NEXT:    [[TMP36:%.*]] = phi ptr [ [[TMP46:%.*]], %[[TMP45:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1
+; CHECK-NEXT:    br label %[[BB41:.*]]
+; CHECK:       [[BB38:.*]]:
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[TMP42:%.*]], i64 1
+; CHECK-NEXT:    [[TMP40:%.*]] = icmp eq ptr [[TMP39]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP40]], label %[[TMP45]], label %[[BB41]]
+; CHECK:       [[BB41]]:
+; CHECK-NEXT:    [[TMP42]] = phi ptr [ [[TMP2]], %[[BB35]] ], [ [[TMP39]], %[[BB38]] ]
+; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp eq i8 [[TMP37]], [[TMP43]]
+; CHECK-NEXT:    br i1 [[TMP44]], label %[[DOTLOOPEXIT]], label %[[BB38]]
+; CHECK:       [[TMP45]]:
+; CHECK-NEXT:    [[TMP46]] = getelementptr inbounds i8, ptr [[TMP36]], i64 1
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq ptr [[TMP46]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP47]], label %[[DOTLOOPEXIT1]], label %[[BB35]]
+; CHECK:       [[_LOOPEXIT:.*:]]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP36]], %[[BB41]] ], [ [[TMP28]], %[[BB26]] ]
+; CHECK-NEXT:    br label %[[BB48]]
+; CHECK:       [[_LOOPEXIT1:.*:]]
+; CHECK-NEXT:    br label %[[BB48]]
+; CHECK:       [[BB48]]:
+; CHECK-NEXT:    [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT:    ret ptr [[TMP49]]
+;
+  %5 = icmp eq ptr %0, %1
+  %6 = icmp eq ptr %2, %3
+  %7 = or i1 %5, %6
+  br i1 %7, label %21, label %8
+
+8:
+  %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
+  %10 = load i8, ptr %9, align 1
+  br label %14
+
+11:
+  %12 = getelementptr inbounds i8, ptr %15, i64 1
+  %13 = icmp eq ptr %12, %3
+  br i1 %13, label %18, label %14
+
+14:
+  %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
+  %16 = load i8, ptr %15, align 1
+  %17 = icmp eq i8 %10, %16
+  br i1 %17, label %21, label %11
+
+18:
+  %19 = getelementptr inbounds i8, ptr %9, i64 1
+  %20 = icmp eq ptr %19, %1
+  br i1 %20, label %21, label %8
+
+21:
+  %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
+  ret ptr %22
+}
+
+; Same as @find_first_of_i8 but with i16.
+; This is accepted and generates a similar loop.
+define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+; CHECK-LABEL: define ptr @find_first_of_i16(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB48:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK:       [[_PREHEADER:.*:]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[SEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[SEARCH]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP10]], i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = and <vscale x 8 x i1> [[TMP8]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[SEARCH]], i32 1, <vscale x 8 x i1> [[TMP13]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    br label %[[BB15:.*]]
+; CHECK:       [[BB15]]:
+; CHECK-NEXT:    [[NEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[NEEDLE]] to i64
+; CHECK-NEXT:    [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP16]], i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP19:%.*]] = and <vscale x 8 x i1> [[TMP8]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[NEEDLE]], i32 1, <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 8 x i16> [[TMP20]], i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP21]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i16> [[TMP20]], <vscale x 8 x i16> [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP23:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[TMP22]], i64 0)
+; CHECK-NEXT:    [[TMP24:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> [[TMP14]], <8 x i16> [[TMP23]], <vscale x 8 x i1> [[TMP13]])
+; CHECK-NEXT:    [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1(<vscale x 8 x i1> [[TMP24]])
+; CHECK-NEXT:    br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]]
+; CHECK:       [[BB26]]:
+; CHECK-NEXT:    [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[TMP24]], i1 true)
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i16, ptr [[SEARCH]], i64 [[TMP27]]
+; CHECK-NEXT:    br label %[[DOTLOOPEXIT:.*]]
+; CHECK:       [[TMP29]]:
+; CHECK-NEXT:    [[TMP30]] = getelementptr i16, ptr [[NEEDLE]], i64 8
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]]
+; CHECK:       [[TMP32]]:
+; CHECK-NEXT:    [[TMP33]] = getelementptr i16, ptr [[SEARCH]], i64 8
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP34]], label %[[BB9]], label %[[DOTLOOPEXIT1:.*]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    br label %[[BB35:.*]]
+; CHECK:       [[BB35]]:
+; CHECK-NEXT:    [[TMP36:%.*]] = phi ptr [ [[TMP46:%.*]], %[[TMP45:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i16, ptr [[TMP36]], align 1
+; CHECK-NEXT:    br label %[[BB41:.*]]
+; CHECK:       [[BB38:.*]]:
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i16, ptr [[TMP42:%.*]], i64 1
+; CHECK-NEXT:    [[TMP40:%.*]] = icmp eq ptr [[TMP39]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP40]], label %[[TMP45]], label %[[BB41]]
+; CHECK:       [[BB41]]:
+; CHECK-NEXT:    [[TMP42]] = phi ptr [ [[TMP2]], %[[BB35]] ], [ [[TMP39]], %[[BB38]] ]
+; CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[TMP42]], align 1
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp eq i16 [[TMP37]], [[TMP43]]
+; CHECK-NEXT:    br i1 [[TMP44]], label %[[DOTLOOPEXIT]], label %[[BB38]]
+; CHECK:       [[TMP45]]:
+; CHECK-NEXT:    [[TMP46]] = getelementptr inbounds i16, ptr [[TMP36]], i64 1
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq ptr [[TMP46]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP47]], label %[[DOTLOOPEXIT1]], label %[[BB35]]
+; CHECK:       [[_LOOPEXIT:.*:]]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP36]], %[[BB41]] ], [ [[TMP28]], %[[BB26]] ]
+; CHECK-NEXT:    br label %[[BB48]]
+; CHECK:       [[_LOOPEXIT1:.*:]]
+; CHECK-NEXT:    br label %[[BB48]]
+; CHECK:       [[BB48]]:
+; CHECK-NEXT:    [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT:    ret ptr [[TMP49]]
+;
+  %5 = icmp eq ptr %0, %1
+  %6 = icmp eq ptr %2, %3
+  %7 = or i1 %5, %6
+  br i1 %7, label %21, label %8
+
+8:
+  %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
+  %10 = load i16, ptr %9, align 1
+  br label %14
+
+11:
+  %12 = getelementptr inbounds i16, ptr %15, i64 1
+  %13 = icmp eq ptr %12, %3
+  br i1 %13, label %18, label %14
+
+14:
+  %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
+  %16 = load i16, ptr %15, align 1
+  %17 = icmp eq i16 %10, %16
+  br i1 %17, label %21, label %11
+
+18:
+  %19 = getelementptr inbounds i16, ptr %9, i64 1
+  %20 = icmp eq ptr %19, %1
+  br i1 %20, label %21, label %8
+
+21:
+  %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
+  ret ptr %22
+}
+
+; Same as @find_first_of_i8 but with `ne' comparison.
+; This is rejected for now, but should eventually be supported.
+define ptr @find_first_not_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+; CHECK-LABEL: define ptr @find_first_not_of_i8(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK:       [[_PREHEADER:.*:]]
+; CHECK-NEXT:    br label %[[BB8:.*]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    br label %[[BB14:.*]]
+; CHECK:       [[BB11:.*]]:
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i8 [[TMP10]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
+; CHECK:       [[TMP18]]:
+; CHECK-NEXT:    [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
+; CHECK:       [[_LOOPEXIT:.*:]]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
+; CHECK-NEXT:    br label %[[BB21]]
+; CHECK:       [[_LOOPEXIT1:.*:]]
+; CHECK-NEXT:    br label %[[BB21]]
+; CHECK:       [[BB21]]:
+; CHECK-NEXT:    [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT:    ret ptr [[TMP22]]
+;
+  %5 = icmp eq ptr %0, %1
+  %6 = icmp eq ptr %2, %3
+  %7 = or i1 %5, %6
+  br i1 %7, label %21, label %8
+
+8:
+  %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
+  %10 = load i8, ptr %9, align 1
+  br label %14
+
+11:
+  %12 = getelementptr inbounds i8, ptr %15, i64 1
+  %13 = icmp eq ptr %12, %3
+  br i1 %13, label %18, label %14
+
+14:
+  %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
+  %16 = load i8, ptr %15, align 1
+  %17 = icmp ne i8 %10, %16
+  br i1 %17, label %21, label %11
+
+18:
+  %19 = getelementptr inbounds i8, ptr %9, i64 1
+  %20 = icmp eq ptr %19, %1
+  br i1 %20, label %21, label %8
+
+21:
+  %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
+  ret ptr %22
+}
+
+; This is the same as @find_first_of_i8 but without SVE2, which we require to
+; perform the conversion.
+define ptr @find_first_of_i8_nosve2(ptr %0, ptr %1, ptr %2, ptr %3) {
+; CHECK-LABEL: define ptr @find_first_of_i8_nosve2(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK:       [[_PREHEADER:.*:]]
+; CHECK-NEXT:    br label %[[BB8:.*]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    br label %[[BB14:.*]]
+; CHECK:       [[BB11:.*]]:
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i8 [[TMP10]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
+; CHECK:       [[TMP18]]:
+; CHECK-NEXT:    [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
+; CHECK:       [[_LOOPEXIT:.*:]]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
+; CHECK-NEXT:    br label %[[BB21]]
+; CHECK:       [[_LOOPEXIT1:.*:]]
+; CHECK-NEXT:    br label %[[BB21]]
+; CHECK:       [[BB21]]:
+; CHECK-NEXT:    [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT:    ret ptr [[TMP22]]
+;
+  %5 = icmp eq ptr %0, %1
+  %6 = icmp eq ptr %2, %3
+  %7 = or i1 %5, %6
+  br i1 %7, label %21, label %8
+
+8:
+  %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
+  %10 = load i8, ptr %9, align 1
+  br label %14
+
+11:
+  %12 = getelementptr inbounds i8, ptr %15, i64 1
+  %13 = icmp eq ptr %12, %3
+  br i1 %13, label %18, label %14
+
+14:
+  %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
+  %16 = load i8, ptr %15, align 1
+  %17 = icmp eq i8 %10, %16
+  br i1 %17, label %21, label %11
+
+18:
+  %19 = getelementptr inbounds i8, ptr %9, i64 1
+  %20 = icmp eq ptr %19, %1
+  br i1 %20, label %21, label %8
+
+21:
+  %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
+  ret ptr %22
+}
+
+; Like @find_first_of_i8 but the inner PHI is also used outside the loop nest (NOTE: the compare below is `ne', as in @find_first_not_of_i8).
+; This isn't supported.
+define ptr @find_first_of_i8_outside_use(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+; CHECK-LABEL: define ptr @find_first_of_i8_outside_use(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK:       [[_PREHEADER:.*:]]
+; CHECK-NEXT:    br label %[[BB8:.*]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    br label %[[BB14:.*]]
+; CHECK:       [[BB11:.*]]:
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i8 [[TMP10]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
+; CHECK:       [[TMP18]]:
+; CHECK-NEXT:    [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
+; CHECK:       [[_LOOPEXIT:.*:]]
+; CHECK-NEXT:    [[DOTLCSSA3:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP15]], %[[BB14]] ]
+; CHECK-NEXT:    br label %[[BB21]]
+; CHECK:       [[_LOOPEXIT1:.*:]]
+; CHECK-NEXT:    br label %[[BB21]]
+; CHECK:       [[BB21]]:
+; CHECK-NEXT:    [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA3]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT:    [[TMP23:%.*]] = phi ptr [ [[TMP3]], [[TMP4]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP3]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT:    ret ptr [[TMP23]]
+;
+  %5 = icmp eq ptr %0, %1
+  %6 = icmp eq ptr %2, %3
+  %7 = or i1 %5, %6
+  br i1 %7, label %21, label %8
+
+8:
+  %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
+  %10 = load i8, ptr %9, align 1
+  br label %14
+
+11:
+  %12 = getelementptr inbounds i8, ptr %15, i64 1
+  %13 = icmp eq ptr %12, %3
+  br i1 %13, label %18, label %14
+
+14:
+  %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
+  %16 = load i8, ptr %15, align 1
+  %17 = icmp ne i8 %10, %16
+  br i1 %17, label %21, label %11
+
+18:
+  %19 = getelementptr inbounds i8, ptr %9, i64 1
+  %20 = icmp eq ptr %19, %1
+  br i1 %20, label %21, label %8
+
+21:
+  %22 = phi ptr [ %1, %4 ], [  %9, %14 ], [ %1, %18 ]
+  %23 = phi ptr [ %3, %4 ], [ %15, %14 ], [ %3, %18 ]
+  ret ptr %23
+}
+
+attributes #0 = { "target-features"="+sve2" }

>From 2241058094a49c63a5912f5d72a0b69a9236fe02 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Wed, 11 Dec 2024 02:37:58 -0800
Subject: [PATCH 2/6] Add RUN line with -disable(...) and refactor tests

---
 llvm/test/CodeGen/AArch64/find-first-byte.ll | 219 +++++++++----------
 1 file changed, 103 insertions(+), 116 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/find-first-byte.ll b/llvm/test/CodeGen/AArch64/find-first-byte.ll
index a324896413d78c6..202ee0982d95ad2 100644
--- a/llvm/test/CodeGen/AArch64/find-first-byte.ll
+++ b/llvm/test/CodeGen/AArch64/find-first-byte.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -mtriple=aarch64 -mattr=+sve -passes='loop(loop-idiom-vectorize)' -verify-loop-info -verify-dom-info -S < %s | FileCheck %s
+; RUN: opt -mtriple=aarch64 -mattr=+sve -passes='loop(loop-idiom-vectorize)' -disable-loop-idiom-vectorize-find-first-byte -S < %s | FileCheck -check-prefix=DISABLE %s
 
 ; Base case based on `libcxx/include/__algorithm/find_first_of.h':
 ;   char* find_first_of(char *first, char *last, char *s_first, char *s_last) {
@@ -20,20 +21,20 @@ define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]]
 ; CHECK:       [[BB9]]:
-; CHECK-NEXT:    [[SEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[SEARCH]] to i64
+; CHECK-NEXT:    [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64
 ; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP10]], i64 [[TMP11]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = and <vscale x 16 x i1> [[TMP8]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[SEARCH]], i32 1, <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i8> zeroinitializer)
 ; CHECK-NEXT:    br label %[[BB15:.*]]
 ; CHECK:       [[BB15]]:
-; CHECK-NEXT:    [[NEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[NEEDLE]] to i64
+; CHECK-NEXT:    [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
 ; CHECK-NEXT:    [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64
 ; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP16]], i64 [[TMP17]])
 ; CHECK-NEXT:    [[TMP19:%.*]] = and <vscale x 16 x i1> [[TMP8]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[NEEDLE]], i32 1, <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> zeroinitializer)
 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 16 x i8> [[TMP20]], i64 0
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP21]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
@@ -44,14 +45,14 @@ define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 ; CHECK-NEXT:    br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]]
 ; CHECK:       [[BB26]]:
 ; CHECK-NEXT:    [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP24]], i1 true)
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[SEARCH]], i64 [[TMP27]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[PSEARCH]], i64 [[TMP27]]
 ; CHECK-NEXT:    br label %[[DOTLOOPEXIT:.*]]
 ; CHECK:       [[TMP29]]:
-; CHECK-NEXT:    [[TMP30]] = getelementptr i8, ptr [[NEEDLE]], i64 16
+; CHECK-NEXT:    [[TMP30]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
 ; CHECK-NEXT:    [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]]
 ; CHECK-NEXT:    br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]]
 ; CHECK:       [[TMP32]]:
-; CHECK-NEXT:    [[TMP33]] = getelementptr i8, ptr [[SEARCH]], i64 16
+; CHECK-NEXT:    [[TMP33]] = getelementptr i8, ptr [[PSEARCH]], i64 16
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP34]], label %[[BB9]], label %[[DOTLOOPEXIT1:.*]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -81,6 +82,40 @@ define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 ; CHECK:       [[BB48]]:
 ; CHECK-NEXT:    [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
 ; CHECK-NEXT:    ret ptr [[TMP49]]
+;
+; DISABLE-LABEL: define ptr @find_first_of_i8(
+; DISABLE-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
+; DISABLE-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; DISABLE-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; DISABLE-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; DISABLE-NEXT:    br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
+; DISABLE:       [[_PREHEADER:.*:]]
+; DISABLE-NEXT:    br label %[[BB8:.*]]
+; DISABLE:       [[BB8]]:
+; DISABLE-NEXT:    [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
+; DISABLE-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; DISABLE-NEXT:    br label %[[BB14:.*]]
+; DISABLE:       [[BB11:.*]]:
+; DISABLE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1
+; DISABLE-NEXT:    [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
+; DISABLE-NEXT:    br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
+; DISABLE:       [[BB14]]:
+; DISABLE-NEXT:    [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
+; DISABLE-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
+; DISABLE-NEXT:    [[TMP17:%.*]] = icmp eq i8 [[TMP10]], [[TMP16]]
+; DISABLE-NEXT:    br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
+; DISABLE:       [[TMP18]]:
+; DISABLE-NEXT:    [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1
+; DISABLE-NEXT:    [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
+; DISABLE-NEXT:    br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
+; DISABLE:       [[_LOOPEXIT:.*:]]
+; DISABLE-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
+; DISABLE-NEXT:    br label %[[BB21]]
+; DISABLE:       [[_LOOPEXIT1:.*:]]
+; DISABLE-NEXT:    br label %[[BB21]]
+; DISABLE:       [[BB21]]:
+; DISABLE-NEXT:    [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; DISABLE-NEXT:    ret ptr [[TMP22]]
 ;
   %5 = icmp eq ptr %0, %1
   %6 = icmp eq ptr %2, %3
@@ -116,6 +151,7 @@ define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 ; Same as @find_first_of_i8 but with i16.
 ; This is accepted and generates a similar loop.
 define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+;
 ; CHECK-LABEL: define ptr @find_first_of_i16(
 ; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
@@ -126,20 +162,20 @@ define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]]
 ; CHECK:       [[BB9]]:
-; CHECK-NEXT:    [[SEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[SEARCH]] to i64
+; CHECK-NEXT:    [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64
 ; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP10]], i64 [[TMP11]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = and <vscale x 8 x i1> [[TMP8]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[SEARCH]], i32 1, <vscale x 8 x i1> [[TMP13]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PSEARCH]], i32 1, <vscale x 8 x i1> [[TMP13]], <vscale x 8 x i16> zeroinitializer)
 ; CHECK-NEXT:    br label %[[BB15:.*]]
 ; CHECK:       [[BB15]]:
-; CHECK-NEXT:    [[NEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[NEEDLE]] to i64
+; CHECK-NEXT:    [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
 ; CHECK-NEXT:    [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64
 ; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP16]], i64 [[TMP17]])
 ; CHECK-NEXT:    [[TMP19:%.*]] = and <vscale x 8 x i1> [[TMP8]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[NEEDLE]], i32 1, <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP20:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PNEEDLE]], i32 1, <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i16> zeroinitializer)
 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 8 x i16> [[TMP20]], i64 0
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP21]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
@@ -150,14 +186,14 @@ define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 ; CHECK-NEXT:    br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]]
 ; CHECK:       [[BB26]]:
 ; CHECK-NEXT:    [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[TMP24]], i1 true)
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i16, ptr [[SEARCH]], i64 [[TMP27]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i16, ptr [[PSEARCH]], i64 [[TMP27]]
 ; CHECK-NEXT:    br label %[[DOTLOOPEXIT:.*]]
 ; CHECK:       [[TMP29]]:
-; CHECK-NEXT:    [[TMP30]] = getelementptr i16, ptr [[NEEDLE]], i64 8
+; CHECK-NEXT:    [[TMP30]] = getelementptr i16, ptr [[PNEEDLE]], i64 8
 ; CHECK-NEXT:    [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]]
 ; CHECK-NEXT:    br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]]
 ; CHECK:       [[TMP32]]:
-; CHECK-NEXT:    [[TMP33]] = getelementptr i16, ptr [[SEARCH]], i64 8
+; CHECK-NEXT:    [[TMP33]] = getelementptr i16, ptr [[PSEARCH]], i64 8
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP34]], label %[[BB9]], label %[[DOTLOOPEXIT1:.*]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -187,6 +223,40 @@ define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 ; CHECK:       [[BB48]]:
 ; CHECK-NEXT:    [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
 ; CHECK-NEXT:    ret ptr [[TMP49]]
+;
+; DISABLE-LABEL: define ptr @find_first_of_i16(
+; DISABLE-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
+; DISABLE-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; DISABLE-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; DISABLE-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; DISABLE-NEXT:    br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
+; DISABLE:       [[_PREHEADER:.*:]]
+; DISABLE-NEXT:    br label %[[BB8:.*]]
+; DISABLE:       [[BB8]]:
+; DISABLE-NEXT:    [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
+; DISABLE-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP9]], align 1
+; DISABLE-NEXT:    br label %[[BB14:.*]]
+; DISABLE:       [[BB11:.*]]:
+; DISABLE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP15:%.*]], i64 1
+; DISABLE-NEXT:    [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
+; DISABLE-NEXT:    br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
+; DISABLE:       [[BB14]]:
+; DISABLE-NEXT:    [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
+; DISABLE-NEXT:    [[TMP16:%.*]] = load i16, ptr [[TMP15]], align 1
+; DISABLE-NEXT:    [[TMP17:%.*]] = icmp eq i16 [[TMP10]], [[TMP16]]
+; DISABLE-NEXT:    br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
+; DISABLE:       [[TMP18]]:
+; DISABLE-NEXT:    [[TMP19]] = getelementptr inbounds i16, ptr [[TMP9]], i64 1
+; DISABLE-NEXT:    [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
+; DISABLE-NEXT:    br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
+; DISABLE:       [[_LOOPEXIT:.*:]]
+; DISABLE-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
+; DISABLE-NEXT:    br label %[[BB21]]
+; DISABLE:       [[_LOOPEXIT1:.*:]]
+; DISABLE-NEXT:    br label %[[BB21]]
+; DISABLE:       [[BB21]]:
+; DISABLE-NEXT:    [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; DISABLE-NEXT:    ret ptr [[TMP22]]
 ;
   %5 = icmp eq ptr %0, %1
   %6 = icmp eq ptr %2, %3
@@ -219,42 +289,17 @@ define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
   ret ptr %22
 }
 
+; From here on we only test for the presence/absence of the intrinsic.
+; UTC_ARGS: --disable
+
 ; Same as @find_first_of_i8 but with `ne' comparison.
 ; This is rejected for now, but should eventually be supported.
 define ptr @find_first_not_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 ; CHECK-LABEL: define ptr @find_first_not_of_i8(
-; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
-; CHECK:       [[_PREHEADER:.*:]]
-; CHECK-NEXT:    br label %[[BB8:.*]]
-; CHECK:       [[BB8]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
-; CHECK-NEXT:    br label %[[BB14:.*]]
-; CHECK:       [[BB11:.*]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
-; CHECK:       [[BB14]]:
-; CHECK-NEXT:    [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i8 [[TMP10]], [[TMP16]]
-; CHECK-NEXT:    br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
-; CHECK:       [[TMP18]]:
-; CHECK-NEXT:    [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
-; CHECK:       [[_LOOPEXIT:.*:]]
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
-; CHECK-NEXT:    br label %[[BB21]]
-; CHECK:       [[_LOOPEXIT1:.*:]]
-; CHECK-NEXT:    br label %[[BB21]]
-; CHECK:       [[BB21]]:
-; CHECK-NEXT:    [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
-; CHECK-NEXT:    ret ptr [[TMP22]]
+; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
+;
+; DISABLE-LABEL: define ptr @find_first_not_of_i8(
+; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
 ;
   %5 = icmp eq ptr %0, %1
   %6 = icmp eq ptr %2, %3
@@ -291,38 +336,10 @@ define ptr @find_first_not_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 ; perform the conversion.
 define ptr @find_first_of_i8_nosve2(ptr %0, ptr %1, ptr %2, ptr %3) {
 ; CHECK-LABEL: define ptr @find_first_of_i8_nosve2(
-; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
-; CHECK:       [[_PREHEADER:.*:]]
-; CHECK-NEXT:    br label %[[BB8:.*]]
-; CHECK:       [[BB8]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
-; CHECK-NEXT:    br label %[[BB14:.*]]
-; CHECK:       [[BB11:.*]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
-; CHECK:       [[BB14]]:
-; CHECK-NEXT:    [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i8 [[TMP10]], [[TMP16]]
-; CHECK-NEXT:    br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
-; CHECK:       [[TMP18]]:
-; CHECK-NEXT:    [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
-; CHECK:       [[_LOOPEXIT:.*:]]
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
-; CHECK-NEXT:    br label %[[BB21]]
-; CHECK:       [[_LOOPEXIT1:.*:]]
-; CHECK-NEXT:    br label %[[BB21]]
-; CHECK:       [[BB21]]:
-; CHECK-NEXT:    [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
-; CHECK-NEXT:    ret ptr [[TMP22]]
+; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
+;
+; DISABLE-LABEL: define ptr @find_first_of_i8_nosve2(
+; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
 ;
   %5 = icmp eq ptr %0, %1
   %6 = icmp eq ptr %2, %3
@@ -359,40 +376,10 @@ define ptr @find_first_of_i8_nosve2(ptr %0, ptr %1, ptr %2, ptr %3) {
 ; This isn't supported.
 define ptr @find_first_of_i8_outside_use(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 ; CHECK-LABEL: define ptr @find_first_of_i8_outside_use(
-; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
-; CHECK:       [[_PREHEADER:.*:]]
-; CHECK-NEXT:    br label %[[BB8:.*]]
-; CHECK:       [[BB8]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
-; CHECK-NEXT:    br label %[[BB14:.*]]
-; CHECK:       [[BB11:.*]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
-; CHECK:       [[BB14]]:
-; CHECK-NEXT:    [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i8 [[TMP10]], [[TMP16]]
-; CHECK-NEXT:    br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
-; CHECK:       [[TMP18]]:
-; CHECK-NEXT:    [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
-; CHECK:       [[_LOOPEXIT:.*:]]
-; CHECK-NEXT:    [[DOTLCSSA3:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP15]], %[[BB14]] ]
-; CHECK-NEXT:    br label %[[BB21]]
-; CHECK:       [[_LOOPEXIT1:.*:]]
-; CHECK-NEXT:    br label %[[BB21]]
-; CHECK:       [[BB21]]:
-; CHECK-NEXT:    [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA3]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
-; CHECK-NEXT:    [[TMP23:%.*]] = phi ptr [ [[TMP3]], [[TMP4]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP3]], %[[DOTLOOPEXIT1]] ]
-; CHECK-NEXT:    ret ptr [[TMP23]]
+; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
+;
+; DISABLE-LABEL: define ptr @find_first_of_i8_outside_use(
+; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
 ;
   %5 = icmp eq ptr %0, %1
   %6 = icmp eq ptr %2, %3

>From 042bda352a7f7fcf8b9b8fb943a8de639ce03e69 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Wed, 11 Dec 2024 09:19:07 -0800
Subject: [PATCH 3/6] Add page boundary checks and address other comments

---
 .../Vectorize/LoopIdiomVectorize.cpp          | 183 +++++++-----
 llvm/test/CodeGen/AArch64/find-first-byte.ll  | 269 ++++++++++--------
 2 files changed, 261 insertions(+), 191 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index a874dd9f8f181ba..7c42cdf056a9380 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -992,8 +992,10 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
 bool LoopIdiomVectorize::recognizeFindFirstByte() {
   // Currently the transformation only works on scalable vector types, although
   // there is no fundamental reason why it cannot be made to work for fixed
-  // vectors too.
-  if (!TTI->supportsScalableVectors() || DisableFindFirstByte)
+  // vectors. We also need to know the target's minimum page size in order to
+  // generate runtime memory checks to ensure the vector version won't fault.
+  if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() ||
+      DisableFindFirstByte)
     return false;
 
   // Define some constants we need throughout.
@@ -1049,30 +1051,33 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
   //   %22 = icmp eq i8 %15, %21
   //   br i1 %22, label %ExitSucc, label %InnerBB
   BasicBlock *ExitSucc, *InnerBB;
-  Value *LoadA, *LoadB;
-  ICmpInst::Predicate MatchPred;
+  Value *LoadSearch, *LoadNeedle;
+  CmpPredicate MatchPred;
   if (!match(MatchBB->getTerminator(),
-             m_Br(m_ICmp(MatchPred, m_Value(LoadA), m_Value(LoadB)),
+             m_Br(m_ICmp(MatchPred, m_Value(LoadSearch), m_Value(LoadNeedle)),
                   m_BasicBlock(ExitSucc), m_BasicBlock(InnerBB))) ||
-      MatchPred != ICmpInst::Predicate::ICMP_EQ ||
-      !InnerLoop->contains(InnerBB))
+      MatchPred != ICmpInst::ICMP_EQ || !InnerLoop->contains(InnerBB))
     return false;
 
   // We expect outside uses of `IndPhi' in ExitSucc (and only there).
   for (User *U : IndPhi->users())
-    if (!CurLoop->contains(cast<Instruction>(U)))
-      if (auto *PN = dyn_cast<PHINode>(U); !PN || PN->getParent() != ExitSucc)
+    if (!CurLoop->contains(cast<Instruction>(U))) {
+      auto *PN = dyn_cast<PHINode>(U);
+      if (!PN || PN->getParent() != ExitSucc)
         return false;
+    }
 
   // Match the loads and check they are simple.
-  Value *A, *B;
-  if (!match(LoadA, m_Load(m_Value(A))) || !cast<LoadInst>(LoadA)->isSimple() ||
-      !match(LoadB, m_Load(m_Value(B))) || !cast<LoadInst>(LoadB)->isSimple())
+  Value *Search, *Needle;
+  if (!match(LoadSearch, m_Load(m_Value(Search))) ||
+      !match(LoadNeedle, m_Load(m_Value(Needle))) ||
+      !cast<LoadInst>(LoadSearch)->isSimple() ||
+      !cast<LoadInst>(LoadNeedle)->isSimple())
     return false;
 
   // Check we are loading valid characters.
-  Type *CharTy = LoadA->getType();
-  if (!CharTy->isIntegerTy() || LoadB->getType() != CharTy)
+  Type *CharTy = LoadSearch->getType();
+  if (!CharTy->isIntegerTy() || LoadNeedle->getType() != CharTy)
     return false;
 
   // Pick the vectorisation factor based on CharTy, work out the cost of the
@@ -1088,40 +1093,40 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
     return false;
 
   // The loads come from two PHIs, each with two incoming values.
-  PHINode *PNA = dyn_cast<PHINode>(A);
-  PHINode *PNB = dyn_cast<PHINode>(B);
-  if (!PNA || PNA->getNumIncomingValues() != 2 || !PNB ||
-      PNB->getNumIncomingValues() != 2)
+  PHINode *PSearch = dyn_cast<PHINode>(Search);
+  PHINode *PNeedle = dyn_cast<PHINode>(Needle);
+  if (!PSearch || PSearch->getNumIncomingValues() != 2 || !PNeedle ||
+      PNeedle->getNumIncomingValues() != 2)
     return false;
 
-  // One PHI comes from the outer loop (PNA), the other one from the inner loop
-  // (PNB). PNA effectively corresponds to IndPhi.
-  if (InnerLoop->contains(PNA))
-    std::swap(PNA, PNB);
-  if (PNA != &Header->front() || PNB != &MatchBB->front())
+  // One PHI comes from the outer loop (PSearch), the other one from the inner
+  // loop (PNeedle). PSearch effectively corresponds to IndPhi.
+  if (InnerLoop->contains(PSearch))
+    std::swap(PSearch, PNeedle);
+  if (PSearch != &Header->front() || PNeedle != &MatchBB->front())
     return false;
 
   // The incoming values of both PHI nodes should be a gep of 1.
-  Value *StartA = PNA->getIncomingValue(0);
-  Value *IndexA = PNA->getIncomingValue(1);
-  if (CurLoop->contains(PNA->getIncomingBlock(0)))
-    std::swap(StartA, IndexA);
+  Value *SearchStart = PSearch->getIncomingValue(0);
+  Value *SearchIndex = PSearch->getIncomingValue(1);
+  if (CurLoop->contains(PSearch->getIncomingBlock(0)))
+    std::swap(SearchStart, SearchIndex);
 
-  Value *StartB = PNB->getIncomingValue(0);
-  Value *IndexB = PNB->getIncomingValue(1);
-  if (InnerLoop->contains(PNB->getIncomingBlock(0)))
-    std::swap(StartB, IndexB);
+  Value *NeedleStart = PNeedle->getIncomingValue(0);
+  Value *NeedleIndex = PNeedle->getIncomingValue(1);
+  if (InnerLoop->contains(PNeedle->getIncomingBlock(0)))
+    std::swap(NeedleStart, NeedleIndex);
 
   // Match the GEPs.
-  if (!match(IndexA, m_GEP(m_Specific(PNA), m_One())) ||
-      !match(IndexB, m_GEP(m_Specific(PNB), m_One())))
+  if (!match(SearchIndex, m_GEP(m_Specific(PSearch), m_One())) ||
+      !match(NeedleIndex, m_GEP(m_Specific(PNeedle), m_One())))
     return false;
 
   // Check the GEPs result type matches `CharTy'.
-  GetElementPtrInst *GEPA = cast<GetElementPtrInst>(IndexA);
-  GetElementPtrInst *GEPB = cast<GetElementPtrInst>(IndexB);
-  if (GEPA->getResultElementType() != CharTy ||
-      GEPB->getResultElementType() != CharTy)
+  GetElementPtrInst *GEPSearch = cast<GetElementPtrInst>(SearchIndex);
+  GetElementPtrInst *GEPNeedle = cast<GetElementPtrInst>(NeedleIndex);
+  if (GEPSearch->getResultElementType() != CharTy ||
+      GEPNeedle->getResultElementType() != CharTy)
     return false;
 
   // InnerBB should increment the address of the needle pointer.
@@ -1131,11 +1136,12 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
   //   %18 = icmp eq ptr %17, %10
   //   br i1 %18, label %OuterBB, label %MatchBB
   BasicBlock *OuterBB;
-  Value *EndB;
+  Value *NeedleEnd;
   if (!match(InnerBB->getTerminator(),
-             m_Br(m_ICmp(MatchPred, m_Specific(GEPB), m_Value(EndB)),
+             m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(GEPNeedle),
+                                 m_Value(NeedleEnd)),
                   m_BasicBlock(OuterBB), m_Specific(MatchBB))) ||
-      MatchPred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(OuterBB))
+      !CurLoop->contains(OuterBB))
     return false;
 
   // OuterBB should increment the address of the search element pointer.
@@ -1145,17 +1151,17 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
   //   %25 = icmp eq ptr %24, %6
   //   br i1 %25, label %ExitFail, label %Header
   BasicBlock *ExitFail;
-  Value *EndA;
+  Value *SearchEnd;
   if (!match(OuterBB->getTerminator(),
-             m_Br(m_ICmp(MatchPred, m_Specific(GEPA), m_Value(EndA)),
-                  m_BasicBlock(ExitFail), m_Specific(Header))) ||
-      MatchPred != ICmpInst::Predicate::ICMP_EQ)
+             m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(GEPSearch),
+                                 m_Value(SearchEnd)),
+                  m_BasicBlock(ExitFail), m_Specific(Header))))
     return false;
 
   LLVM_DEBUG(dbgs() << "Found idiom in loop: \n" << *CurLoop << "\n\n");
 
-  transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, StartA, EndA,
-                         StartB, EndB);
+  transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, SearchStart,
+                         SearchEnd, NeedleStart, NeedleEnd);
   return true;
 }
 
@@ -1187,6 +1193,8 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
   // (I) Inner loop where we iterate over the elements of the needle array.
   //
   // Overall, the blocks do the following:
+  // (0) Check if the arrays can't cross page boundaries. If so, go to (1),
+  //     otherwise fall back to the original scalar loop.
   // (1) Load the search array. Go to (2).
   // (2) (a) Load the needle array.
   //     (b) Splat the first element to the inactive lanes.
@@ -1196,8 +1204,9 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
   //     (2), otherwise go to (5).
   // (5) Check if we've reached the end of the search array. If not loop back to
   //     (1), otherwise exit.
-  // Block (3) is not part of any loop. Blocks (1,5) and (2,4) belong to the
-  // outer and inner loops, respectively.
+  // Blocks (0,3) are not part of any loop. Blocks (1,5) and (2,4) belong to
+  // the outer and inner loops, respectively.
+  BasicBlock *BB0 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
   BasicBlock *BB1 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
   BasicBlock *BB2 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
   BasicBlock *BB3 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
@@ -1209,6 +1218,7 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
   auto InnerLoop = LI->AllocateLoop();
 
   if (auto ParentLoop = CurLoop->getParentLoop()) {
+    ParentLoop->addBasicBlockToLoop(BB0, *LI);
     ParentLoop->addChildLoop(OuterLoop);
     ParentLoop->addBasicBlockToLoop(BB3, *LI);
   } else {
@@ -1224,24 +1234,46 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
   InnerLoop->addBasicBlockToLoop(BB2, *LI);
   InnerLoop->addBasicBlockToLoop(BB4, *LI);
 
-  // Set a reference to the old scalar loop and create a predicate of VF
-  // elements.
-  Builder.SetInsertPoint(Preheader->getTerminator());
-  Value *Pred16 =
+  // Update the terminator added by SplitBlock to branch to the first block.
+  Preheader->getTerminator()->setSuccessor(0, BB0);
+  DTU.applyUpdates({{DominatorTree::Delete, Preheader, SPH},
+                    {DominatorTree::Insert, Preheader, BB0}});
+
+  // (0) Check if we could be crossing a page boundary; if so, fall back to the
+  // old scalar loops. Also create a predicate of VF elements to be used in the
+  // vector loops.
+  Builder.SetInsertPoint(BB0);
+  Value *ISearchStart = Builder.CreatePtrToInt(SearchStart, I64Ty);
+  Value *ISearchEnd = Builder.CreatePtrToInt(SearchEnd, I64Ty);
+  Value *INeedleStart = Builder.CreatePtrToInt(NeedleStart, I64Ty);
+  Value *INeedleEnd = Builder.CreatePtrToInt(NeedleEnd, I64Ty);
+  Value *PredVF =
       Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
                               {ConstantInt::get(I64Ty, 0), ConstVF});
-  Builder.CreateCondBr(Builder.getFalse(), SPH, BB1);
-  Preheader->getTerminator()->eraseFromParent();
-  DTU.applyUpdates({{DominatorTree::Insert, Preheader, BB1}});
+
+  const uint64_t MinPageSize = TTI->getMinPageSize().value();
+  const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize);
+  Value *SearchStartPage = Builder.CreateLShr(ISearchStart, AddrShiftAmt);
+  Value *SearchEndPage = Builder.CreateLShr(ISearchEnd, AddrShiftAmt);
+  Value *NeedleStartPage = Builder.CreateLShr(INeedleStart, AddrShiftAmt);
+  Value *NeedleEndPage = Builder.CreateLShr(INeedleEnd, AddrShiftAmt);
+  Value *SearchPageCmp = Builder.CreateICmpNE(SearchStartPage, SearchEndPage);
+  Value *NeedlePageCmp = Builder.CreateICmpNE(NeedleStartPage, NeedleEndPage);
+
+  Value *CombinedPageCmp = Builder.CreateOr(SearchPageCmp, NeedlePageCmp);
+  BranchInst *CombinedPageBr = Builder.CreateCondBr(CombinedPageCmp, SPH, BB1);
+  CombinedPageBr->setMetadata(LLVMContext::MD_prof,
+                              MDBuilder(Ctx).createBranchWeights(10, 90));
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, BB0, SPH}, {DominatorTree::Insert, BB0, BB1}});
 
   // (1) Load the search array and branch to the inner loop.
   Builder.SetInsertPoint(BB1);
   PHINode *Search = Builder.CreatePHI(PtrTy, 2, "psearch");
-  Value *PredSearch =
-      Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
-                              {Builder.CreatePointerCast(Search, I64Ty),
-                               Builder.CreatePointerCast(SearchEnd, I64Ty)});
-  PredSearch = Builder.CreateAnd(Pred16, PredSearch);
+  Value *PredSearch = Builder.CreateIntrinsic(
+      Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+      {Builder.CreatePtrToInt(Search, I64Ty), ISearchEnd});
+  PredSearch = Builder.CreateAnd(PredVF, PredSearch);
   Value *LoadSearch =
       Builder.CreateMaskedLoad(CharVTy, Search, Align(1), PredSearch, Passthru);
   Builder.CreateBr(BB2);
@@ -1252,11 +1284,10 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
   PHINode *Needle = Builder.CreatePHI(PtrTy, 2, "pneedle");
 
   // (2.a) Load the needle array.
-  Value *PredNeedle =
-      Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
-                              {Builder.CreatePointerCast(Needle, I64Ty),
-                               Builder.CreatePointerCast(NeedleEnd, I64Ty)});
-  PredNeedle = Builder.CreateAnd(Pred16, PredNeedle);
+  Value *PredNeedle = Builder.CreateIntrinsic(
+      Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+      {Builder.CreatePtrToInt(Needle, I64Ty), INeedleEnd});
+  PredNeedle = Builder.CreateAnd(PredVF, PredNeedle);
   Value *LoadNeedle =
       Builder.CreateMaskedLoad(CharVTy, Needle, Align(1), PredNeedle, Passthru);
 
@@ -1279,10 +1310,12 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
 
   // (3) We found a match. Compute the index of its location and exit.
   Builder.SetInsertPoint(BB3);
+  PHINode *MatchLCSSA = Builder.CreatePHI(PtrTy, 1);
+  PHINode *MatchPredLCSSA = Builder.CreatePHI(MatchPred->getType(), 1);
   Value *MatchCnt = Builder.CreateIntrinsic(
       Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()},
-      {MatchPred, /*ZeroIsPoison=*/Builder.getInt1(true)});
-  Value *MatchVal = Builder.CreateGEP(CharTy, Search, MatchCnt);
+      {MatchPredLCSSA, /*ZeroIsPoison=*/Builder.getInt1(true)});
+  Value *MatchVal = Builder.CreateGEP(CharTy, MatchLCSSA, MatchCnt);
   Builder.CreateBr(ExitSucc);
   DTU.applyUpdates({{DominatorTree::Insert, BB3, ExitSucc}});
 
@@ -1301,11 +1334,14 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
   DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1},
                     {DominatorTree::Insert, BB5, ExitFail}});
 
-  // Set up the PHI's.
-  Search->addIncoming(SearchStart, Preheader);
+  // Set up the PHI nodes.
+  Search->addIncoming(SearchStart, BB0);
   Search->addIncoming(NextSearch, BB5);
   Needle->addIncoming(NeedleStart, BB1);
   Needle->addIncoming(NextNeedle, BB4);
+  // These are needed to retain LCSSA form.
+  MatchLCSSA->addIncoming(Search, BB2);
+  MatchPredLCSSA->addIncoming(MatchPred, BB2);
 
   if (VerifyLoops) {
     OuterLoop->verifyLoop();
@@ -1332,11 +1368,16 @@ void LoopIdiomVectorize::transformFindFirstByte(
       expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail,
                           SearchStart, SearchEnd, NeedleStart, NeedleEnd);
 
+  assert(PHBranch->isUnconditional() &&
+         "Expected preheader to terminate with an unconditional branch.");
+
   // Add new incoming values with the result of the transformation to PHINodes
   // of ExitSucc that use IndPhi.
-  for (auto *U : llvm::make_early_inc_range(IndPhi->users()))
-    if (auto *PN = dyn_cast<PHINode>(U); PN && PN->getParent() == ExitSucc)
+  for (auto *U : llvm::make_early_inc_range(IndPhi->users())) {
+    auto *PN = dyn_cast<PHINode>(U);
+    if (PN && PN->getParent() == ExitSucc)
       PN->addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent());
+  }
 
   if (VerifyLoops && CurLoop->getParentLoop()) {
     CurLoop->getParentLoop()->verifyLoop();
diff --git a/llvm/test/CodeGen/AArch64/find-first-byte.ll b/llvm/test/CodeGen/AArch64/find-first-byte.ll
index 202ee0982d95ad2..b7d24c0012abad5 100644
--- a/llvm/test/CodeGen/AArch64/find-first-byte.ll
+++ b/llvm/test/CodeGen/AArch64/find-first-byte.ll
@@ -16,72 +16,85 @@ define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB48:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB60:.*]], [[DOTPREHEADER:label %.*]]
 ; CHECK:       [[_PREHEADER:.*:]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
-; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]]
-; CHECK:       [[BB9]]:
-; CHECK-NEXT:    [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP10]], i64 [[TMP11]])
-; CHECK-NEXT:    [[TMP13:%.*]] = and <vscale x 16 x i1> [[TMP8]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i8> zeroinitializer)
-; CHECK-NEXT:    br label %[[BB15:.*]]
-; CHECK:       [[BB15]]:
-; CHECK-NEXT:    [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
-; CHECK-NEXT:    [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP16]], i64 [[TMP17]])
-; CHECK-NEXT:    [[TMP19:%.*]] = and <vscale x 16 x i1> [[TMP8]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> zeroinitializer)
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 16 x i8> [[TMP20]], i64 0
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP21]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = select <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> [[TMP20]], <vscale x 16 x i8> [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[TMP22]], i64 0)
-; CHECK-NEXT:    [[TMP24:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[TMP14]], <16 x i8> [[TMP23]], <vscale x 16 x i1> [[TMP13]])
-; CHECK-NEXT:    [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP24]])
-; CHECK-NEXT:    br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]]
+; CHECK-NEXT:    br label %[[BB8:.*]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
+; CHECK-NEXT:    [[TMP14:%.*]] = lshr i64 [[TMP9]], 12
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP10]], 12
+; CHECK-NEXT:    [[TMP16:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP12]], 12
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[SCALAR_PH:.*]], label %[[BB21:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       [[BB21]]:
+; CHECK-NEXT:    [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[BB8]] ], [ [[TMP45:%.*]], %[[TMP44:.*]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
+; CHECK-NEXT:    [[TMP23:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP22]], i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP24:%.*]] = and <vscale x 16 x i1> [[TMP13]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, <vscale x 16 x i1> [[TMP24]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    br label %[[BB26:.*]]
 ; CHECK:       [[BB26]]:
-; CHECK-NEXT:    [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP24]], i1 true)
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[PSEARCH]], i64 [[TMP27]]
+; CHECK-NEXT:    [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB21]] ], [ [[TMP42:%.*]], %[[TMP41:.*]] ]
+; CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
+; CHECK-NEXT:    [[TMP28:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP27]], i64 [[TMP12]])
+; CHECK-NEXT:    [[TMP29:%.*]] = and <vscale x 16 x i1> [[TMP13]], [[TMP28]]
+; CHECK-NEXT:    [[TMP30:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, <vscale x 16 x i1> [[TMP29]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 16 x i8> [[TMP30]], i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP31]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP32:%.*]] = select <vscale x 16 x i1> [[TMP29]], <vscale x 16 x i8> [[TMP30]], <vscale x 16 x i8> [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP33:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[TMP32]], i64 0)
+; CHECK-NEXT:    [[TMP34:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[TMP25]], <16 x i8> [[TMP33]], <vscale x 16 x i1> [[TMP24]])
+; CHECK-NEXT:    [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP34]])
+; CHECK-NEXT:    br i1 [[TMP35]], label %[[BB36:.*]], label %[[TMP41]]
+; CHECK:       [[BB36]]:
+; CHECK-NEXT:    [[TMP37:%.*]] = phi ptr [ [[PSEARCH]], %[[BB26]] ]
+; CHECK-NEXT:    [[TMP38:%.*]] = phi <vscale x 16 x i1> [ [[TMP34]], %[[BB26]] ]
+; CHECK-NEXT:    [[TMP39:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP38]], i1 true)
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr [[TMP37]], i64 [[TMP39]]
 ; CHECK-NEXT:    br label %[[DOTLOOPEXIT:.*]]
-; CHECK:       [[TMP29]]:
-; CHECK-NEXT:    [[TMP30]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
-; CHECK-NEXT:    [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]]
-; CHECK:       [[TMP32]]:
-; CHECK-NEXT:    [[TMP33]] = getelementptr i8, ptr [[PSEARCH]], i64 16
-; CHECK-NEXT:    [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP34]], label %[[BB9]], label %[[DOTLOOPEXIT1:.*]]
+; CHECK:       [[TMP41]]:
+; CHECK-NEXT:    [[TMP42]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ult ptr [[TMP42]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP43]], label %[[BB26]], label %[[TMP44]]
+; CHECK:       [[TMP44]]:
+; CHECK-NEXT:    [[TMP45]] = getelementptr i8, ptr [[PSEARCH]], i64 16
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ult ptr [[TMP45]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP46]], label %[[BB21]], label %[[DOTLOOPEXIT1:.*]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    br label %[[BB35:.*]]
-; CHECK:       [[BB35]]:
-; CHECK-NEXT:    [[TMP36:%.*]] = phi ptr [ [[TMP46:%.*]], %[[TMP45:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1
-; CHECK-NEXT:    br label %[[BB41:.*]]
-; CHECK:       [[BB38:.*]]:
-; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[TMP42:%.*]], i64 1
-; CHECK-NEXT:    [[TMP40:%.*]] = icmp eq ptr [[TMP39]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP40]], label %[[TMP45]], label %[[BB41]]
-; CHECK:       [[BB41]]:
-; CHECK-NEXT:    [[TMP42]] = phi ptr [ [[TMP2]], %[[BB35]] ], [ [[TMP39]], %[[BB38]] ]
-; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = icmp eq i8 [[TMP37]], [[TMP43]]
-; CHECK-NEXT:    br i1 [[TMP44]], label %[[DOTLOOPEXIT]], label %[[BB38]]
-; CHECK:       [[TMP45]]:
-; CHECK-NEXT:    [[TMP46]] = getelementptr inbounds i8, ptr [[TMP36]], i64 1
-; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq ptr [[TMP46]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP47]], label %[[DOTLOOPEXIT1]], label %[[BB35]]
+; CHECK-NEXT:    br label %[[BB47:.*]]
+; CHECK:       [[BB47]]:
+; CHECK-NEXT:    [[TMP48:%.*]] = phi ptr [ [[TMP58:%.*]], %[[TMP57:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
+; CHECK-NEXT:    br label %[[BB53:.*]]
+; CHECK:       [[BB50:.*]]:
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP54:%.*]], i64 1
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp eq ptr [[TMP51]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP52]], label %[[TMP57]], label %[[BB53]]
+; CHECK:       [[BB53]]:
+; CHECK-NEXT:    [[TMP54]] = phi ptr [ [[TMP2]], %[[BB47]] ], [ [[TMP51]], %[[BB50]] ]
+; CHECK-NEXT:    [[TMP55:%.*]] = load i8, ptr [[TMP54]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i8 [[TMP49]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label %[[DOTLOOPEXIT]], label %[[BB50]]
+; CHECK:       [[TMP57]]:
+; CHECK-NEXT:    [[TMP58]] = getelementptr inbounds i8, ptr [[TMP48]], i64 1
+; CHECK-NEXT:    [[TMP59:%.*]] = icmp eq ptr [[TMP58]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP59]], label %[[DOTLOOPEXIT1]], label %[[BB47]]
 ; CHECK:       [[_LOOPEXIT:.*:]]
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP36]], %[[BB41]] ], [ [[TMP28]], %[[BB26]] ]
-; CHECK-NEXT:    br label %[[BB48]]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP48]], %[[BB53]] ], [ [[TMP40]], %[[BB36]] ]
+; CHECK-NEXT:    br label %[[BB60]]
 ; CHECK:       [[_LOOPEXIT1:.*:]]
-; CHECK-NEXT:    br label %[[BB48]]
-; CHECK:       [[BB48]]:
-; CHECK-NEXT:    [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
-; CHECK-NEXT:    ret ptr [[TMP49]]
+; CHECK-NEXT:    br label %[[BB60]]
+; CHECK:       [[BB60]]:
+; CHECK-NEXT:    [[TMP61:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT:    ret ptr [[TMP61]]
 ;
 ; DISABLE-LABEL: define ptr @find_first_of_i8(
 ; DISABLE-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -157,72 +170,85 @@ define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB48:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB60:.*]], [[DOTPREHEADER:label %.*]]
 ; CHECK:       [[_PREHEADER:.*:]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
-; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]]
-; CHECK:       [[BB9]]:
-; CHECK-NEXT:    [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP10]], i64 [[TMP11]])
-; CHECK-NEXT:    [[TMP13:%.*]] = and <vscale x 8 x i1> [[TMP8]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PSEARCH]], i32 1, <vscale x 8 x i1> [[TMP13]], <vscale x 8 x i16> zeroinitializer)
-; CHECK-NEXT:    br label %[[BB15:.*]]
-; CHECK:       [[BB15]]:
-; CHECK-NEXT:    [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
-; CHECK-NEXT:    [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP16]], i64 [[TMP17]])
-; CHECK-NEXT:    [[TMP19:%.*]] = and <vscale x 8 x i1> [[TMP8]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PNEEDLE]], i32 1, <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i16> zeroinitializer)
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 8 x i16> [[TMP20]], i64 0
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP21]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i16> [[TMP20]], <vscale x 8 x i16> [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[TMP22]], i64 0)
-; CHECK-NEXT:    [[TMP24:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> [[TMP14]], <8 x i16> [[TMP23]], <vscale x 8 x i1> [[TMP13]])
-; CHECK-NEXT:    [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1(<vscale x 8 x i1> [[TMP24]])
-; CHECK-NEXT:    br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]]
+; CHECK-NEXT:    br label %[[BB8:.*]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
+; CHECK-NEXT:    [[TMP14:%.*]] = lshr i64 [[TMP9]], 12
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP10]], 12
+; CHECK-NEXT:    [[TMP16:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP12]], 12
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[SCALAR_PH:.*]], label %[[BB21:.*]], !prof [[PROF0]]
+; CHECK:       [[BB21]]:
+; CHECK-NEXT:    [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[BB8]] ], [ [[TMP45:%.*]], %[[TMP44:.*]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
+; CHECK-NEXT:    [[TMP23:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP22]], i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP24:%.*]] = and <vscale x 8 x i1> [[TMP13]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PSEARCH]], i32 1, <vscale x 8 x i1> [[TMP24]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    br label %[[BB26:.*]]
 ; CHECK:       [[BB26]]:
-; CHECK-NEXT:    [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[TMP24]], i1 true)
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i16, ptr [[PSEARCH]], i64 [[TMP27]]
+; CHECK-NEXT:    [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB21]] ], [ [[TMP42:%.*]], %[[TMP41:.*]] ]
+; CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
+; CHECK-NEXT:    [[TMP28:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP27]], i64 [[TMP12]])
+; CHECK-NEXT:    [[TMP29:%.*]] = and <vscale x 8 x i1> [[TMP13]], [[TMP28]]
+; CHECK-NEXT:    [[TMP30:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PNEEDLE]], i32 1, <vscale x 8 x i1> [[TMP29]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 8 x i16> [[TMP30]], i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP31]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP32:%.*]] = select <vscale x 8 x i1> [[TMP29]], <vscale x 8 x i16> [[TMP30]], <vscale x 8 x i16> [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP33:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[TMP32]], i64 0)
+; CHECK-NEXT:    [[TMP34:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> [[TMP25]], <8 x i16> [[TMP33]], <vscale x 8 x i1> [[TMP24]])
+; CHECK-NEXT:    [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1(<vscale x 8 x i1> [[TMP34]])
+; CHECK-NEXT:    br i1 [[TMP35]], label %[[BB36:.*]], label %[[TMP41]]
+; CHECK:       [[BB36]]:
+; CHECK-NEXT:    [[TMP37:%.*]] = phi ptr [ [[PSEARCH]], %[[BB26]] ]
+; CHECK-NEXT:    [[TMP38:%.*]] = phi <vscale x 8 x i1> [ [[TMP34]], %[[BB26]] ]
+; CHECK-NEXT:    [[TMP39:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[TMP38]], i1 true)
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr i16, ptr [[TMP37]], i64 [[TMP39]]
 ; CHECK-NEXT:    br label %[[DOTLOOPEXIT:.*]]
-; CHECK:       [[TMP29]]:
-; CHECK-NEXT:    [[TMP30]] = getelementptr i16, ptr [[PNEEDLE]], i64 8
-; CHECK-NEXT:    [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]]
-; CHECK:       [[TMP32]]:
-; CHECK-NEXT:    [[TMP33]] = getelementptr i16, ptr [[PSEARCH]], i64 8
-; CHECK-NEXT:    [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP34]], label %[[BB9]], label %[[DOTLOOPEXIT1:.*]]
+; CHECK:       [[TMP41]]:
+; CHECK-NEXT:    [[TMP42]] = getelementptr i16, ptr [[PNEEDLE]], i64 8
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ult ptr [[TMP42]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP43]], label %[[BB26]], label %[[TMP44]]
+; CHECK:       [[TMP44]]:
+; CHECK-NEXT:    [[TMP45]] = getelementptr i16, ptr [[PSEARCH]], i64 8
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ult ptr [[TMP45]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP46]], label %[[BB21]], label %[[DOTLOOPEXIT1:.*]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    br label %[[BB35:.*]]
-; CHECK:       [[BB35]]:
-; CHECK-NEXT:    [[TMP36:%.*]] = phi ptr [ [[TMP46:%.*]], %[[TMP45:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP37:%.*]] = load i16, ptr [[TMP36]], align 1
-; CHECK-NEXT:    br label %[[BB41:.*]]
-; CHECK:       [[BB38:.*]]:
-; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i16, ptr [[TMP42:%.*]], i64 1
-; CHECK-NEXT:    [[TMP40:%.*]] = icmp eq ptr [[TMP39]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP40]], label %[[TMP45]], label %[[BB41]]
-; CHECK:       [[BB41]]:
-; CHECK-NEXT:    [[TMP42]] = phi ptr [ [[TMP2]], %[[BB35]] ], [ [[TMP39]], %[[BB38]] ]
-; CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[TMP42]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = icmp eq i16 [[TMP37]], [[TMP43]]
-; CHECK-NEXT:    br i1 [[TMP44]], label %[[DOTLOOPEXIT]], label %[[BB38]]
-; CHECK:       [[TMP45]]:
-; CHECK-NEXT:    [[TMP46]] = getelementptr inbounds i16, ptr [[TMP36]], i64 1
-; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq ptr [[TMP46]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP47]], label %[[DOTLOOPEXIT1]], label %[[BB35]]
+; CHECK-NEXT:    br label %[[BB47:.*]]
+; CHECK:       [[BB47]]:
+; CHECK-NEXT:    [[TMP48:%.*]] = phi ptr [ [[TMP58:%.*]], %[[TMP57:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP49:%.*]] = load i16, ptr [[TMP48]], align 1
+; CHECK-NEXT:    br label %[[BB53:.*]]
+; CHECK:       [[BB50:.*]]:
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i16, ptr [[TMP54:%.*]], i64 1
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp eq ptr [[TMP51]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP52]], label %[[TMP57]], label %[[BB53]]
+; CHECK:       [[BB53]]:
+; CHECK-NEXT:    [[TMP54]] = phi ptr [ [[TMP2]], %[[BB47]] ], [ [[TMP51]], %[[BB50]] ]
+; CHECK-NEXT:    [[TMP55:%.*]] = load i16, ptr [[TMP54]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i16 [[TMP49]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label %[[DOTLOOPEXIT]], label %[[BB50]]
+; CHECK:       [[TMP57]]:
+; CHECK-NEXT:    [[TMP58]] = getelementptr inbounds i16, ptr [[TMP48]], i64 1
+; CHECK-NEXT:    [[TMP59:%.*]] = icmp eq ptr [[TMP58]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP59]], label %[[DOTLOOPEXIT1]], label %[[BB47]]
 ; CHECK:       [[_LOOPEXIT:.*:]]
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP36]], %[[BB41]] ], [ [[TMP28]], %[[BB26]] ]
-; CHECK-NEXT:    br label %[[BB48]]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP48]], %[[BB53]] ], [ [[TMP40]], %[[BB36]] ]
+; CHECK-NEXT:    br label %[[BB60]]
 ; CHECK:       [[_LOOPEXIT1:.*:]]
-; CHECK-NEXT:    br label %[[BB48]]
-; CHECK:       [[BB48]]:
-; CHECK-NEXT:    [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
-; CHECK-NEXT:    ret ptr [[TMP49]]
+; CHECK-NEXT:    br label %[[BB60]]
+; CHECK:       [[BB60]]:
+; CHECK-NEXT:    [[TMP61:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT:    ret ptr [[TMP61]]
 ;
 ; DISABLE-LABEL: define ptr @find_first_of_i16(
 ; DISABLE-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
@@ -414,3 +440,6 @@ define ptr @find_first_of_i8_outside_use(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 }
 
 attributes #0 = { "target-features"="+sve2" }
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 10, i32 90}
+;.

>From ab3b6464990eb54216772c8baa963e9c514522c9 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Fri, 31 Jan 2025 09:08:29 +0000
Subject: [PATCH 4/6] Move tests to llvm/test/Transforms/LoopIdiom/AArch64

---
 .../{CodeGen => Transforms/LoopIdiom}/AArch64/find-first-byte.ll  | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm/test/{CodeGen => Transforms/LoopIdiom}/AArch64/find-first-byte.ll (100%)

diff --git a/llvm/test/CodeGen/AArch64/find-first-byte.ll b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
similarity index 100%
rename from llvm/test/CodeGen/AArch64/find-first-byte.ll
rename to llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll

>From 38860a86c6f444a446f1f00cd110784123f09f60 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Tue, 4 Feb 2025 13:12:22 +0000
Subject: [PATCH 5/6] Add names to blocks and variables

---
 .../Vectorize/LoopIdiomVectorize.cpp          | 106 ++-
 .../LoopIdiom/AArch64/find-first-byte.ll      | 777 +++++++++---------
 2 files changed, 458 insertions(+), 425 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 7c42cdf056a9380..44fe5ba3a0bfdaa 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -1184,7 +1184,7 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
   // Split block in the original loop preheader.
   // SPH is the new preheader to the old scalar loop.
   BasicBlock *SPH = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
-                               nullptr, "scalar_ph");
+                               nullptr, "scalar_preheader");
 
   // Create the blocks that we're going to use.
   //
@@ -1206,12 +1206,17 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
   //     (1), otherwise exit.
   // Blocks (0,3) are not part of any loop. Blocks (1,5) and (2,4) belong to
   // the outer and inner loops, respectively.
-  BasicBlock *BB0 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
-  BasicBlock *BB1 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
-  BasicBlock *BB2 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
-  BasicBlock *BB3 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
-  BasicBlock *BB4 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
-  BasicBlock *BB5 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+  BasicBlock *BB0 = BasicBlock::Create(Ctx, "mem_check", SPH->getParent(), SPH);
+  BasicBlock *BB1 =
+      BasicBlock::Create(Ctx, "find_first_vec_header", SPH->getParent(), SPH);
+  BasicBlock *BB2 =
+      BasicBlock::Create(Ctx, "match_check_vec", SPH->getParent(), SPH);
+  BasicBlock *BB3 =
+      BasicBlock::Create(Ctx, "calculate_match", SPH->getParent(), SPH);
+  BasicBlock *BB4 =
+      BasicBlock::Create(Ctx, "needle_check_vec", SPH->getParent(), SPH);
+  BasicBlock *BB5 =
+      BasicBlock::Create(Ctx, "search_check_vec", SPH->getParent(), SPH);
 
   // Update LoopInfo with the new loops.
   auto OuterLoop = LI->AllocateLoop();
@@ -1243,24 +1248,35 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
   // old scalar loops. Also create a predicate of VF elements to be used in the
   // vector loops.
   Builder.SetInsertPoint(BB0);
-  Value *ISearchStart = Builder.CreatePtrToInt(SearchStart, I64Ty);
-  Value *ISearchEnd = Builder.CreatePtrToInt(SearchEnd, I64Ty);
-  Value *INeedleStart = Builder.CreatePtrToInt(NeedleStart, I64Ty);
-  Value *INeedleEnd = Builder.CreatePtrToInt(NeedleEnd, I64Ty);
+  Value *ISearchStart =
+      Builder.CreatePtrToInt(SearchStart, I64Ty, "search_start_int");
+  Value *ISearchEnd =
+      Builder.CreatePtrToInt(SearchEnd, I64Ty, "search_end_int");
+  Value *INeedleStart =
+      Builder.CreatePtrToInt(NeedleStart, I64Ty, "needle_start_int");
+  Value *INeedleEnd =
+      Builder.CreatePtrToInt(NeedleEnd, I64Ty, "needle_end_int");
   Value *PredVF =
       Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
                               {ConstantInt::get(I64Ty, 0), ConstVF});
 
   const uint64_t MinPageSize = TTI->getMinPageSize().value();
   const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize);
-  Value *SearchStartPage = Builder.CreateLShr(ISearchStart, AddrShiftAmt);
-  Value *SearchEndPage = Builder.CreateLShr(ISearchEnd, AddrShiftAmt);
-  Value *NeedleStartPage = Builder.CreateLShr(INeedleStart, AddrShiftAmt);
-  Value *NeedleEndPage = Builder.CreateLShr(INeedleEnd, AddrShiftAmt);
-  Value *SearchPageCmp = Builder.CreateICmpNE(SearchStartPage, SearchEndPage);
-  Value *NeedlePageCmp = Builder.CreateICmpNE(NeedleStartPage, NeedleEndPage);
-
-  Value *CombinedPageCmp = Builder.CreateOr(SearchPageCmp, NeedlePageCmp);
+  Value *SearchStartPage =
+      Builder.CreateLShr(ISearchStart, AddrShiftAmt, "search_start_page");
+  Value *SearchEndPage =
+      Builder.CreateLShr(ISearchEnd, AddrShiftAmt, "search_end_page");
+  Value *NeedleStartPage =
+      Builder.CreateLShr(INeedleStart, AddrShiftAmt, "needle_start_page");
+  Value *NeedleEndPage =
+      Builder.CreateLShr(INeedleEnd, AddrShiftAmt, "needle_end_page");
+  Value *SearchPageCmp =
+      Builder.CreateICmpNE(SearchStartPage, SearchEndPage, "search_page_cmp");
+  Value *NeedlePageCmp =
+      Builder.CreateICmpNE(NeedleStartPage, NeedleEndPage, "needle_page_cmp");
+
+  Value *CombinedPageCmp =
+      Builder.CreateOr(SearchPageCmp, NeedlePageCmp, "combined_page_cmp");
   BranchInst *CombinedPageBr = Builder.CreateCondBr(CombinedPageCmp, SPH, BB1);
   CombinedPageBr->setMetadata(LLVMContext::MD_prof,
                               MDBuilder(Ctx).createBranchWeights(10, 90));
@@ -1272,10 +1288,11 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
   PHINode *Search = Builder.CreatePHI(PtrTy, 2, "psearch");
   Value *PredSearch = Builder.CreateIntrinsic(
       Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
-      {Builder.CreatePtrToInt(Search, I64Ty), ISearchEnd});
-  PredSearch = Builder.CreateAnd(PredVF, PredSearch);
-  Value *LoadSearch =
-      Builder.CreateMaskedLoad(CharVTy, Search, Align(1), PredSearch, Passthru);
+      {Builder.CreatePtrToInt(Search, I64Ty), ISearchEnd}, nullptr,
+      "search_pred");
+  PredSearch = Builder.CreateAnd(PredVF, PredSearch, "search_masked");
+  Value *LoadSearch = Builder.CreateMaskedLoad(
+      CharVTy, Search, Align(1), PredSearch, Passthru, "search_load_vec");
   Builder.CreateBr(BB2);
   DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2}});
 
@@ -1286,23 +1303,27 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
   // (2.a) Load the needle array.
   Value *PredNeedle = Builder.CreateIntrinsic(
       Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
-      {Builder.CreatePtrToInt(Needle, I64Ty), INeedleEnd});
-  PredNeedle = Builder.CreateAnd(PredVF, PredNeedle);
-  Value *LoadNeedle =
-      Builder.CreateMaskedLoad(CharVTy, Needle, Align(1), PredNeedle, Passthru);
+      {Builder.CreatePtrToInt(Needle, I64Ty), INeedleEnd}, nullptr,
+      "needle_pred");
+  PredNeedle = Builder.CreateAnd(PredVF, PredNeedle, "needle_masked");
+  Value *LoadNeedle = Builder.CreateMaskedLoad(
+      CharVTy, Needle, Align(1), PredNeedle, Passthru, "needle_load_vec");
 
   // (2.b) Splat the first element to the inactive lanes.
-  Value *Needle0 = Builder.CreateExtractElement(LoadNeedle, uint64_t(0));
-  Value *Needle0Splat =
-      Builder.CreateVectorSplat(ElementCount::getScalable(VF), Needle0);
-  LoadNeedle = Builder.CreateSelect(PredNeedle, LoadNeedle, Needle0Splat);
-  LoadNeedle = Builder.CreateExtractVector(
-      FixedVectorType::get(CharTy, VF), LoadNeedle, ConstantInt::get(I64Ty, 0));
+  Value *Needle0 =
+      Builder.CreateExtractElement(LoadNeedle, uint64_t(0), "needle0");
+  Value *Needle0Splat = Builder.CreateVectorSplat(ElementCount::getScalable(VF),
+                                                  Needle0, "needle0");
+  LoadNeedle = Builder.CreateSelect(PredNeedle, LoadNeedle, Needle0Splat,
+                                    "needle_splat");
+  LoadNeedle =
+      Builder.CreateExtractVector(FixedVectorType::get(CharTy, VF), LoadNeedle,
+                                  ConstantInt::get(I64Ty, 0), "needle_vec");
 
   // (2.c) Test if there's a match.
   Value *MatchPred = Builder.CreateIntrinsic(
       Intrinsic::experimental_vector_match, {CharVTy, LoadNeedle->getType()},
-      {LoadSearch, LoadNeedle, PredSearch});
+      {LoadSearch, LoadNeedle, PredSearch}, nullptr, "match_pred");
   Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred);
   Builder.CreateCondBr(IfAnyMatch, BB3, BB4);
   DTU.applyUpdates(
@@ -1310,25 +1331,30 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
 
   // (3) We found a match. Compute the index of its location and exit.
   Builder.SetInsertPoint(BB3);
-  PHINode *MatchLCSSA = Builder.CreatePHI(PtrTy, 1);
-  PHINode *MatchPredLCSSA = Builder.CreatePHI(MatchPred->getType(), 1);
+  PHINode *MatchLCSSA = Builder.CreatePHI(PtrTy, 1, "match_start");
+  PHINode *MatchPredLCSSA =
+      Builder.CreatePHI(MatchPred->getType(), 1, "match_vec");
   Value *MatchCnt = Builder.CreateIntrinsic(
       Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()},
-      {MatchPredLCSSA, /*ZeroIsPoison=*/Builder.getInt1(true)});
-  Value *MatchVal = Builder.CreateGEP(CharTy, MatchLCSSA, MatchCnt);
+      {MatchPredLCSSA, /*ZeroIsPoison=*/Builder.getInt1(true)}, nullptr,
+      "match_idx");
+  Value *MatchVal =
+      Builder.CreateGEP(CharTy, MatchLCSSA, MatchCnt, "match_res");
   Builder.CreateBr(ExitSucc);
   DTU.applyUpdates({{DominatorTree::Insert, BB3, ExitSucc}});
 
   // (4) Check if we've reached the end of the needle array.
   Builder.SetInsertPoint(BB4);
-  Value *NextNeedle = Builder.CreateGEP(CharTy, Needle, ConstVF);
+  Value *NextNeedle =
+      Builder.CreateGEP(CharTy, Needle, ConstVF, "needle_next_vec");
   Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedle, NeedleEnd), BB2, BB5);
   DTU.applyUpdates(
       {{DominatorTree::Insert, BB4, BB2}, {DominatorTree::Insert, BB4, BB5}});
 
   // (5) Check if we've reached the end of the search array.
   Builder.SetInsertPoint(BB5);
-  Value *NextSearch = Builder.CreateGEP(CharTy, Search, ConstVF);
+  Value *NextSearch =
+      Builder.CreateGEP(CharTy, Search, ConstVF, "search_next_vec");
   Builder.CreateCondBr(Builder.CreateICmpULT(NextSearch, SearchEnd), BB1,
                        ExitFail);
   DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1},
diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
index b7d24c0012abad5..92cde4d27f2c0cf 100644
--- a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
+++ b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
@@ -10,309 +10,314 @@
 ;           return first;
 ;     return last;
 ;   }
-define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 {
 ; CHECK-LABEL: define ptr @find_first_of_i8(
-; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB60:.*]], [[DOTPREHEADER:label %.*]]
-; CHECK:       [[_PREHEADER:.*:]]
-; CHECK-NEXT:    br label %[[BB8:.*]]
-; CHECK:       [[BB8]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
-; CHECK-NEXT:    [[TMP14:%.*]] = lshr i64 [[TMP9]], 12
-; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP10]], 12
-; CHECK-NEXT:    [[TMP16:%.*]] = lshr i64 [[TMP11]], 12
-; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP12]], 12
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne i64 [[TMP16]], [[TMP17]]
-; CHECK-NEXT:    [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]]
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[SCALAR_PH:.*]], label %[[BB21:.*]], !prof [[PROF0:![0-9]+]]
-; CHECK:       [[BB21]]:
-; CHECK-NEXT:    [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[BB8]] ], [ [[TMP45:%.*]], %[[TMP44:.*]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
-; CHECK-NEXT:    [[TMP23:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP22]], i64 [[TMP10]])
-; CHECK-NEXT:    [[TMP24:%.*]] = and <vscale x 16 x i1> [[TMP13]], [[TMP23]]
-; CHECK-NEXT:    [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, <vscale x 16 x i1> [[TMP24]], <vscale x 16 x i8> zeroinitializer)
-; CHECK-NEXT:    br label %[[BB26:.*]]
-; CHECK:       [[BB26]]:
-; CHECK-NEXT:    [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB21]] ], [ [[TMP42:%.*]], %[[TMP41:.*]] ]
-; CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
-; CHECK-NEXT:    [[TMP28:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP27]], i64 [[TMP12]])
-; CHECK-NEXT:    [[TMP29:%.*]] = and <vscale x 16 x i1> [[TMP13]], [[TMP28]]
-; CHECK-NEXT:    [[TMP30:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, <vscale x 16 x i1> [[TMP29]], <vscale x 16 x i8> zeroinitializer)
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 16 x i8> [[TMP30]], i64 0
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP31]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP32:%.*]] = select <vscale x 16 x i1> [[TMP29]], <vscale x 16 x i8> [[TMP30]], <vscale x 16 x i8> [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP33:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[TMP32]], i64 0)
-; CHECK-NEXT:    [[TMP34:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[TMP25]], <16 x i8> [[TMP33]], <vscale x 16 x i1> [[TMP24]])
-; CHECK-NEXT:    [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP34]])
-; CHECK-NEXT:    br i1 [[TMP35]], label %[[BB36:.*]], label %[[TMP41]]
-; CHECK:       [[BB36]]:
-; CHECK-NEXT:    [[TMP37:%.*]] = phi ptr [ [[PSEARCH]], %[[BB26]] ]
-; CHECK-NEXT:    [[TMP38:%.*]] = phi <vscale x 16 x i1> [ [[TMP34]], %[[BB26]] ]
-; CHECK-NEXT:    [[TMP39:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP38]], i1 true)
-; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr [[TMP37]], i64 [[TMP39]]
-; CHECK-NEXT:    br label %[[DOTLOOPEXIT:.*]]
-; CHECK:       [[TMP41]]:
-; CHECK-NEXT:    [[TMP42]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
-; CHECK-NEXT:    [[TMP43:%.*]] = icmp ult ptr [[TMP42]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP43]], label %[[BB26]], label %[[TMP44]]
-; CHECK:       [[TMP44]]:
-; CHECK-NEXT:    [[TMP45]] = getelementptr i8, ptr [[PSEARCH]], i64 16
-; CHECK-NEXT:    [[TMP46:%.*]] = icmp ult ptr [[TMP45]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP46]], label %[[BB21]], label %[[DOTLOOPEXIT1:.*]]
-; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    br label %[[BB47:.*]]
-; CHECK:       [[BB47]]:
-; CHECK-NEXT:    [[TMP48:%.*]] = phi ptr [ [[TMP58:%.*]], %[[TMP57:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
-; CHECK-NEXT:    br label %[[BB53:.*]]
-; CHECK:       [[BB50:.*]]:
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP54:%.*]], i64 1
-; CHECK-NEXT:    [[TMP52:%.*]] = icmp eq ptr [[TMP51]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP52]], label %[[TMP57]], label %[[BB53]]
-; CHECK:       [[BB53]]:
-; CHECK-NEXT:    [[TMP54]] = phi ptr [ [[TMP2]], %[[BB47]] ], [ [[TMP51]], %[[BB50]] ]
-; CHECK-NEXT:    [[TMP55:%.*]] = load i8, ptr [[TMP54]], align 1
-; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i8 [[TMP49]], [[TMP55]]
-; CHECK-NEXT:    br i1 [[TMP56]], label %[[DOTLOOPEXIT]], label %[[BB50]]
-; CHECK:       [[TMP57]]:
-; CHECK-NEXT:    [[TMP58]] = getelementptr inbounds i8, ptr [[TMP48]], i64 1
-; CHECK-NEXT:    [[TMP59:%.*]] = icmp eq ptr [[TMP58]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP59]], label %[[DOTLOOPEXIT1]], label %[[BB47]]
-; CHECK:       [[_LOOPEXIT:.*:]]
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP48]], %[[BB53]] ], [ [[TMP40]], %[[BB36]] ]
-; CHECK-NEXT:    br label %[[BB60]]
-; CHECK:       [[_LOOPEXIT1:.*:]]
-; CHECK-NEXT:    br label %[[BB60]]
-; CHECK:       [[BB60]]:
-; CHECK-NEXT:    [[TMP61:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
-; CHECK-NEXT:    ret ptr [[TMP61]]
+; CHECK-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]]
+; CHECK-NEXT:    [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]]
+; CHECK-NEXT:    [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]]
+; CHECK-NEXT:    br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]]
+; CHECK:       [[HEADER_PREHEADER]]:
+; CHECK-NEXT:    br label %[[MEM_CHECK:.*]]
+; CHECK:       [[MEM_CHECK]]:
+; CHECK-NEXT:    [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
+; CHECK-NEXT:    [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
+; CHECK-NEXT:    [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
+; CHECK-NEXT:    [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
+; CHECK-NEXT:    [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
+; CHECK-NEXT:    [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
+; CHECK-NEXT:    [[NEEDLE_START_PAGE:%.*]] = lshr i64 [[NEEDLE_START_INT]], 12
+; CHECK-NEXT:    [[NEEDLE_END_PAGE:%.*]] = lshr i64 [[NEEDLE_END_INT]], 12
+; CHECK-NEXT:    [[SEARCH_PAGE_CMP:%.*]] = icmp ne i64 [[SEARCH_START_PAGE]], [[SEARCH_END_PAGE]]
+; CHECK-NEXT:    [[NEEDLE_PAGE_CMP:%.*]] = icmp ne i64 [[NEEDLE_START_PAGE]], [[NEEDLE_END_PAGE]]
+; CHECK-NEXT:    [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
+; CHECK-NEXT:    br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       [[FIND_FIRST_VEC_HEADER]]:
+; CHECK-NEXT:    [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
+; CHECK-NEXT:    [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
+; CHECK-NEXT:    [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
+; CHECK-NEXT:    [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    br label %[[MATCH_CHECK_VEC:.*]]
+; CHECK:       [[MATCH_CHECK_VEC]]:
+; CHECK-NEXT:    [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
+; CHECK-NEXT:    [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
+; CHECK-NEXT:    [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
+; CHECK-NEXT:    [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0
+; CHECK-NEXT:    [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0
+; CHECK-NEXT:    [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[NEEDLE0_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[NEEDLE_SPLAT:%.*]] = select <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], <vscale x 16 x i8> [[NEEDLE0_SPLAT]]
+; CHECK-NEXT:    [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
+; CHECK-NEXT:    [[MATCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]]
+; CHECK:       [[CALCULATE_MATCH]]:
+; CHECK-NEXT:    [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
+; CHECK-NEXT:    [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
+; CHECK-NEXT:    [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[MATCH_VEC]], i1 true)
+; CHECK-NEXT:    [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
+; CHECK-NEXT:    br label %[[EXIT_LOOPEXIT:.*]]
+; CHECK:       [[NEEDLE_CHECK_VEC]]:
+; CHECK-NEXT:    [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]]
+; CHECK:       [[SEARCH_CHECK_VEC]]:
+; CHECK-NEXT:    [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]]
+; CHECK:       [[SCALAR_PREHEADER]]:
+; CHECK-NEXT:    br label %[[HEADER:.*]]
+; CHECK:       [[HEADER]]:
+; CHECK-NEXT:    [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[SCALAR_PREHEADER]] ]
+; CHECK-NEXT:    [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1
+; CHECK-NEXT:    br label %[[MATCH_CHECK:.*]]
+; CHECK:       [[NEEDLE_CHECK:.*]]:
+; CHECK-NEXT:    [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1
+; CHECK-NEXT:    [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]]
+; CHECK-NEXT:    br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]]
+; CHECK:       [[MATCH_CHECK]]:
+; CHECK-NEXT:    [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ]
+; CHECK-NEXT:    [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1
+; CHECK-NEXT:    [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]]
+; CHECK-NEXT:    br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT]], label %[[NEEDLE_CHECK]]
+; CHECK:       [[SEARCH_CHECK]]:
+; CHECK-NEXT:    [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1
+; CHECK-NEXT:    [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]]
+; CHECK-NEXT:    br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1]], label %[[HEADER]]
+; CHECK:       [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT:    [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ], [ [[MATCH_RES]], %[[CALCULATE_MATCH]] ]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT_LOOPEXIT1]]:
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ]
+; CHECK-NEXT:    ret ptr [[RES]]
 ;
 ; DISABLE-LABEL: define ptr @find_first_of_i8(
-; DISABLE-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
-; DISABLE-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
-; DISABLE-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
-; DISABLE-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; DISABLE-NEXT:    br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
-; DISABLE:       [[_PREHEADER:.*:]]
-; DISABLE-NEXT:    br label %[[BB8:.*]]
-; DISABLE:       [[BB8]]:
-; DISABLE-NEXT:    [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
-; DISABLE-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
-; DISABLE-NEXT:    br label %[[BB14:.*]]
-; DISABLE:       [[BB11:.*]]:
-; DISABLE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1
-; DISABLE-NEXT:    [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
-; DISABLE-NEXT:    br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
-; DISABLE:       [[BB14]]:
-; DISABLE-NEXT:    [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
-; DISABLE-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
-; DISABLE-NEXT:    [[TMP17:%.*]] = icmp eq i8 [[TMP10]], [[TMP16]]
-; DISABLE-NEXT:    br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
-; DISABLE:       [[TMP18]]:
-; DISABLE-NEXT:    [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1
-; DISABLE-NEXT:    [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
-; DISABLE-NEXT:    br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
-; DISABLE:       [[_LOOPEXIT:.*:]]
-; DISABLE-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
-; DISABLE-NEXT:    br label %[[BB21]]
-; DISABLE:       [[_LOOPEXIT1:.*:]]
-; DISABLE-NEXT:    br label %[[BB21]]
-; DISABLE:       [[BB21]]:
-; DISABLE-NEXT:    [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
-; DISABLE-NEXT:    ret ptr [[TMP22]]
+; DISABLE-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0:[0-9]+]] {
+; DISABLE-NEXT:  [[ENTRY:.*]]:
+; DISABLE-NEXT:    [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]]
+; DISABLE-NEXT:    [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]]
+; DISABLE-NEXT:    [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]]
+; DISABLE-NEXT:    br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]]
+; DISABLE:       [[HEADER_PREHEADER]]:
+; DISABLE-NEXT:    br label %[[HEADER:.*]]
+; DISABLE:       [[HEADER]]:
+; DISABLE-NEXT:    [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[HEADER_PREHEADER]] ]
+; DISABLE-NEXT:    [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1
+; DISABLE-NEXT:    br label %[[MATCH_CHECK:.*]]
+; DISABLE:       [[NEEDLE_CHECK:.*]]:
+; DISABLE-NEXT:    [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1
+; DISABLE-NEXT:    [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]]
+; DISABLE-NEXT:    br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]]
+; DISABLE:       [[MATCH_CHECK]]:
+; DISABLE-NEXT:    [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ]
+; DISABLE-NEXT:    [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1
+; DISABLE-NEXT:    [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]]
+; DISABLE-NEXT:    br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[NEEDLE_CHECK]]
+; DISABLE:       [[SEARCH_CHECK]]:
+; DISABLE-NEXT:    [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1
+; DISABLE-NEXT:    [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]]
+; DISABLE-NEXT:    br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1:.*]], label %[[HEADER]]
+; DISABLE:       [[EXIT_LOOPEXIT]]:
+; DISABLE-NEXT:    [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ]
+; DISABLE-NEXT:    br label %[[EXIT]]
+; DISABLE:       [[EXIT_LOOPEXIT1]]:
+; DISABLE-NEXT:    br label %[[EXIT]]
+; DISABLE:       [[EXIT]]:
+; DISABLE-NEXT:    [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ]
+; DISABLE-NEXT:    ret ptr [[RES]]
 ;
-  %5 = icmp eq ptr %0, %1
-  %6 = icmp eq ptr %2, %3
-  %7 = or i1 %5, %6
-  br i1 %7, label %21, label %8
-
-8:
-  %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
-  %10 = load i8, ptr %9, align 1
-  br label %14
-
-11:
-  %12 = getelementptr inbounds i8, ptr %15, i64 1
-  %13 = icmp eq ptr %12, %3
-  br i1 %13, label %18, label %14
-
-14:
-  %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
-  %16 = load i8, ptr %15, align 1
-  %17 = icmp eq i8 %10, %16
-  br i1 %17, label %21, label %11
-
-18:
-  %19 = getelementptr inbounds i8, ptr %9, i64 1
-  %20 = icmp eq ptr %19, %1
-  br i1 %20, label %21, label %8
-
-21:
-  %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
-  ret ptr %22
+entry:
+  %search_test = icmp eq ptr %search_start, %search_end
+  %needle_test = icmp eq ptr %needle_start, %needle_end
+  %combined_test = or i1 %search_test, %needle_test
+  br i1 %combined_test, label %exit, label %header
+
+header:
+  %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ]
+  %search_load = load i8, ptr %search_ptr, align 1
+  br label %match_check
+
+needle_check:
+  %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1
+  %needle_cmp = icmp eq ptr %needle_next, %needle_end
+  br i1 %needle_cmp, label %search_check, label %match_check
+
+match_check:
+  %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ]
+  %needle_load = load i8, ptr %needle_ptr, align 1
+  %match_cmp = icmp eq i8 %search_load, %needle_load
+  br i1 %match_cmp, label %exit, label %needle_check
+
+search_check:
+  %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1
+  %search_cmp = icmp eq ptr %search_next, %search_end
+  br i1 %search_cmp, label %exit, label %header
+
+exit:
+  %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ]
+  ret ptr %res
 }
 
-; Same as @find_first_of_i8 but with i16.
+; Equivalent to @find_first_of_i8 but with i16.
 ; This is accepted and generates a similar loop.
-define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
-;
+define ptr @find_first_of_i16(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 {
 ; CHECK-LABEL: define ptr @find_first_of_i16(
-; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[BB60:.*]], [[DOTPREHEADER:label %.*]]
-; CHECK:       [[_PREHEADER:.*:]]
-; CHECK-NEXT:    br label %[[BB8:.*]]
-; CHECK:       [[BB8]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP13:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
-; CHECK-NEXT:    [[TMP14:%.*]] = lshr i64 [[TMP9]], 12
-; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP10]], 12
-; CHECK-NEXT:    [[TMP16:%.*]] = lshr i64 [[TMP11]], 12
-; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP12]], 12
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne i64 [[TMP16]], [[TMP17]]
-; CHECK-NEXT:    [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]]
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[SCALAR_PH:.*]], label %[[BB21:.*]], !prof [[PROF0]]
-; CHECK:       [[BB21]]:
-; CHECK-NEXT:    [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[BB8]] ], [ [[TMP45:%.*]], %[[TMP44:.*]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
-; CHECK-NEXT:    [[TMP23:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP22]], i64 [[TMP10]])
-; CHECK-NEXT:    [[TMP24:%.*]] = and <vscale x 8 x i1> [[TMP13]], [[TMP23]]
-; CHECK-NEXT:    [[TMP25:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PSEARCH]], i32 1, <vscale x 8 x i1> [[TMP24]], <vscale x 8 x i16> zeroinitializer)
-; CHECK-NEXT:    br label %[[BB26:.*]]
-; CHECK:       [[BB26]]:
-; CHECK-NEXT:    [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB21]] ], [ [[TMP42:%.*]], %[[TMP41:.*]] ]
-; CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
-; CHECK-NEXT:    [[TMP28:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP27]], i64 [[TMP12]])
-; CHECK-NEXT:    [[TMP29:%.*]] = and <vscale x 8 x i1> [[TMP13]], [[TMP28]]
-; CHECK-NEXT:    [[TMP30:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PNEEDLE]], i32 1, <vscale x 8 x i1> [[TMP29]], <vscale x 8 x i16> zeroinitializer)
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 8 x i16> [[TMP30]], i64 0
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP31]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP32:%.*]] = select <vscale x 8 x i1> [[TMP29]], <vscale x 8 x i16> [[TMP30]], <vscale x 8 x i16> [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP33:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[TMP32]], i64 0)
-; CHECK-NEXT:    [[TMP34:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> [[TMP25]], <8 x i16> [[TMP33]], <vscale x 8 x i1> [[TMP24]])
-; CHECK-NEXT:    [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1(<vscale x 8 x i1> [[TMP34]])
-; CHECK-NEXT:    br i1 [[TMP35]], label %[[BB36:.*]], label %[[TMP41]]
-; CHECK:       [[BB36]]:
-; CHECK-NEXT:    [[TMP37:%.*]] = phi ptr [ [[PSEARCH]], %[[BB26]] ]
-; CHECK-NEXT:    [[TMP38:%.*]] = phi <vscale x 8 x i1> [ [[TMP34]], %[[BB26]] ]
-; CHECK-NEXT:    [[TMP39:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[TMP38]], i1 true)
-; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr i16, ptr [[TMP37]], i64 [[TMP39]]
-; CHECK-NEXT:    br label %[[DOTLOOPEXIT:.*]]
-; CHECK:       [[TMP41]]:
-; CHECK-NEXT:    [[TMP42]] = getelementptr i16, ptr [[PNEEDLE]], i64 8
-; CHECK-NEXT:    [[TMP43:%.*]] = icmp ult ptr [[TMP42]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP43]], label %[[BB26]], label %[[TMP44]]
-; CHECK:       [[TMP44]]:
-; CHECK-NEXT:    [[TMP45]] = getelementptr i16, ptr [[PSEARCH]], i64 8
-; CHECK-NEXT:    [[TMP46:%.*]] = icmp ult ptr [[TMP45]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP46]], label %[[BB21]], label %[[DOTLOOPEXIT1:.*]]
-; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    br label %[[BB47:.*]]
-; CHECK:       [[BB47]]:
-; CHECK-NEXT:    [[TMP48:%.*]] = phi ptr [ [[TMP58:%.*]], %[[TMP57:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP49:%.*]] = load i16, ptr [[TMP48]], align 1
-; CHECK-NEXT:    br label %[[BB53:.*]]
-; CHECK:       [[BB50:.*]]:
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i16, ptr [[TMP54:%.*]], i64 1
-; CHECK-NEXT:    [[TMP52:%.*]] = icmp eq ptr [[TMP51]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP52]], label %[[TMP57]], label %[[BB53]]
-; CHECK:       [[BB53]]:
-; CHECK-NEXT:    [[TMP54]] = phi ptr [ [[TMP2]], %[[BB47]] ], [ [[TMP51]], %[[BB50]] ]
-; CHECK-NEXT:    [[TMP55:%.*]] = load i16, ptr [[TMP54]], align 1
-; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i16 [[TMP49]], [[TMP55]]
-; CHECK-NEXT:    br i1 [[TMP56]], label %[[DOTLOOPEXIT]], label %[[BB50]]
-; CHECK:       [[TMP57]]:
-; CHECK-NEXT:    [[TMP58]] = getelementptr inbounds i16, ptr [[TMP48]], i64 1
-; CHECK-NEXT:    [[TMP59:%.*]] = icmp eq ptr [[TMP58]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP59]], label %[[DOTLOOPEXIT1]], label %[[BB47]]
-; CHECK:       [[_LOOPEXIT:.*:]]
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP48]], %[[BB53]] ], [ [[TMP40]], %[[BB36]] ]
-; CHECK-NEXT:    br label %[[BB60]]
-; CHECK:       [[_LOOPEXIT1:.*:]]
-; CHECK-NEXT:    br label %[[BB60]]
-; CHECK:       [[BB60]]:
-; CHECK-NEXT:    [[TMP61:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
-; CHECK-NEXT:    ret ptr [[TMP61]]
+; CHECK-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]]
+; CHECK-NEXT:    [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]]
+; CHECK-NEXT:    [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]]
+; CHECK-NEXT:    br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]]
+; CHECK:       [[HEADER_PREHEADER]]:
+; CHECK-NEXT:    br label %[[MEM_CHECK:.*]]
+; CHECK:       [[MEM_CHECK]]:
+; CHECK-NEXT:    [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
+; CHECK-NEXT:    [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
+; CHECK-NEXT:    [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
+; CHECK-NEXT:    [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
+; CHECK-NEXT:    [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
+; CHECK-NEXT:    [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
+; CHECK-NEXT:    [[NEEDLE_START_PAGE:%.*]] = lshr i64 [[NEEDLE_START_INT]], 12
+; CHECK-NEXT:    [[NEEDLE_END_PAGE:%.*]] = lshr i64 [[NEEDLE_END_INT]], 12
+; CHECK-NEXT:    [[SEARCH_PAGE_CMP:%.*]] = icmp ne i64 [[SEARCH_START_PAGE]], [[SEARCH_END_PAGE]]
+; CHECK-NEXT:    [[NEEDLE_PAGE_CMP:%.*]] = icmp ne i64 [[NEEDLE_START_PAGE]], [[NEEDLE_END_PAGE]]
+; CHECK-NEXT:    [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
+; CHECK-NEXT:    br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]]
+; CHECK:       [[FIND_FIRST_VEC_HEADER]]:
+; CHECK-NEXT:    [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
+; CHECK-NEXT:    [[SEARCH_PRED:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
+; CHECK-NEXT:    [[SEARCH_MASKED:%.*]] = and <vscale x 8 x i1> [[TMP0]], [[SEARCH_PRED]]
+; CHECK-NEXT:    [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PSEARCH]], i32 1, <vscale x 8 x i1> [[SEARCH_MASKED]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    br label %[[MATCH_CHECK_VEC:.*]]
+; CHECK:       [[MATCH_CHECK_VEC]]:
+; CHECK-NEXT:    [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
+; CHECK-NEXT:    [[NEEDLE_PRED:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
+; CHECK-NEXT:    [[NEEDLE_MASKED:%.*]] = and <vscale x 8 x i1> [[TMP0]], [[NEEDLE_PRED]]
+; CHECK-NEXT:    [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PNEEDLE]], i32 1, <vscale x 8 x i1> [[NEEDLE_MASKED]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[NEEDLE0:%.*]] = extractelement <vscale x 8 x i16> [[NEEDLE_LOAD_VEC]], i64 0
+; CHECK-NEXT:    [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[NEEDLE0]], i64 0
+; CHECK-NEXT:    [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[NEEDLE0_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    [[NEEDLE_SPLAT:%.*]] = select <vscale x 8 x i1> [[NEEDLE_MASKED]], <vscale x 8 x i16> [[NEEDLE_LOAD_VEC]], <vscale x 8 x i16> [[NEEDLE0_SPLAT]]
+; CHECK-NEXT:    [[NEEDLE_VEC:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[NEEDLE_SPLAT]], i64 0)
+; CHECK-NEXT:    [[MATCH_PRED:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> [[SEARCH_LOAD_VEC]], <8 x i16> [[NEEDLE_VEC]], <vscale x 8 x i1> [[SEARCH_MASKED]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1(<vscale x 8 x i1> [[MATCH_PRED]])
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]]
+; CHECK:       [[CALCULATE_MATCH]]:
+; CHECK-NEXT:    [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
+; CHECK-NEXT:    [[MATCH_VEC:%.*]] = phi <vscale x 8 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
+; CHECK-NEXT:    [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[MATCH_VEC]], i1 true)
+; CHECK-NEXT:    [[MATCH_RES:%.*]] = getelementptr i16, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
+; CHECK-NEXT:    br label %[[EXIT_LOOPEXIT:.*]]
+; CHECK:       [[NEEDLE_CHECK_VEC]]:
+; CHECK-NEXT:    [[NEEDLE_NEXT_VEC]] = getelementptr i16, ptr [[PNEEDLE]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]]
+; CHECK:       [[SEARCH_CHECK_VEC]]:
+; CHECK-NEXT:    [[SEARCH_NEXT_VEC]] = getelementptr i16, ptr [[PSEARCH]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]]
+; CHECK:       [[SCALAR_PREHEADER]]:
+; CHECK-NEXT:    br label %[[HEADER:.*]]
+; CHECK:       [[HEADER]]:
+; CHECK-NEXT:    [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[SCALAR_PREHEADER]] ]
+; CHECK-NEXT:    [[SEARCH_LOAD:%.*]] = load i16, ptr [[SEARCH_PTR]], align 1
+; CHECK-NEXT:    br label %[[MATCH_CHECK:.*]]
+; CHECK:       [[NEEDLE_CHECK:.*]]:
+; CHECK-NEXT:    [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i16, ptr [[NEEDLE_PTR:%.*]], i64 1
+; CHECK-NEXT:    [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]]
+; CHECK-NEXT:    br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]]
+; CHECK:       [[MATCH_CHECK]]:
+; CHECK-NEXT:    [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ]
+; CHECK-NEXT:    [[NEEDLE_LOAD:%.*]] = load i16, ptr [[NEEDLE_PTR]], align 1
+; CHECK-NEXT:    [[MATCH_CMP:%.*]] = icmp eq i16 [[SEARCH_LOAD]], [[NEEDLE_LOAD]]
+; CHECK-NEXT:    br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT]], label %[[NEEDLE_CHECK]]
+; CHECK:       [[SEARCH_CHECK]]:
+; CHECK-NEXT:    [[SEARCH_NEXT]] = getelementptr inbounds i16, ptr [[SEARCH_PTR]], i64 1
+; CHECK-NEXT:    [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]]
+; CHECK-NEXT:    br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1]], label %[[HEADER]]
+; CHECK:       [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT:    [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ], [ [[MATCH_RES]], %[[CALCULATE_MATCH]] ]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT_LOOPEXIT1]]:
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ]
+; CHECK-NEXT:    ret ptr [[RES]]
 ;
 ; DISABLE-LABEL: define ptr @find_first_of_i16(
-; DISABLE-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
-; DISABLE-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
-; DISABLE-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
-; DISABLE-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; DISABLE-NEXT:    br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
-; DISABLE:       [[_PREHEADER:.*:]]
-; DISABLE-NEXT:    br label %[[BB8:.*]]
-; DISABLE:       [[BB8]]:
-; DISABLE-NEXT:    [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
-; DISABLE-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP9]], align 1
-; DISABLE-NEXT:    br label %[[BB14:.*]]
-; DISABLE:       [[BB11:.*]]:
-; DISABLE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP15:%.*]], i64 1
-; DISABLE-NEXT:    [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
-; DISABLE-NEXT:    br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
-; DISABLE:       [[BB14]]:
-; DISABLE-NEXT:    [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
-; DISABLE-NEXT:    [[TMP16:%.*]] = load i16, ptr [[TMP15]], align 1
-; DISABLE-NEXT:    [[TMP17:%.*]] = icmp eq i16 [[TMP10]], [[TMP16]]
-; DISABLE-NEXT:    br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
-; DISABLE:       [[TMP18]]:
-; DISABLE-NEXT:    [[TMP19]] = getelementptr inbounds i16, ptr [[TMP9]], i64 1
-; DISABLE-NEXT:    [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
-; DISABLE-NEXT:    br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
-; DISABLE:       [[_LOOPEXIT:.*:]]
-; DISABLE-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
-; DISABLE-NEXT:    br label %[[BB21]]
-; DISABLE:       [[_LOOPEXIT1:.*:]]
-; DISABLE-NEXT:    br label %[[BB21]]
-; DISABLE:       [[BB21]]:
-; DISABLE-NEXT:    [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
-; DISABLE-NEXT:    ret ptr [[TMP22]]
+; DISABLE-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] {
+; DISABLE-NEXT:  [[ENTRY:.*]]:
+; DISABLE-NEXT:    [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]]
+; DISABLE-NEXT:    [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]]
+; DISABLE-NEXT:    [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]]
+; DISABLE-NEXT:    br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]]
+; DISABLE:       [[HEADER_PREHEADER]]:
+; DISABLE-NEXT:    br label %[[HEADER:.*]]
+; DISABLE:       [[HEADER]]:
+; DISABLE-NEXT:    [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[HEADER_PREHEADER]] ]
+; DISABLE-NEXT:    [[SEARCH_LOAD:%.*]] = load i16, ptr [[SEARCH_PTR]], align 1
+; DISABLE-NEXT:    br label %[[MATCH_CHECK:.*]]
+; DISABLE:       [[NEEDLE_CHECK:.*]]:
+; DISABLE-NEXT:    [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i16, ptr [[NEEDLE_PTR:%.*]], i64 1
+; DISABLE-NEXT:    [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]]
+; DISABLE-NEXT:    br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]]
+; DISABLE:       [[MATCH_CHECK]]:
+; DISABLE-NEXT:    [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ]
+; DISABLE-NEXT:    [[NEEDLE_LOAD:%.*]] = load i16, ptr [[NEEDLE_PTR]], align 1
+; DISABLE-NEXT:    [[MATCH_CMP:%.*]] = icmp eq i16 [[SEARCH_LOAD]], [[NEEDLE_LOAD]]
+; DISABLE-NEXT:    br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[NEEDLE_CHECK]]
+; DISABLE:       [[SEARCH_CHECK]]:
+; DISABLE-NEXT:    [[SEARCH_NEXT]] = getelementptr inbounds i16, ptr [[SEARCH_PTR]], i64 1
+; DISABLE-NEXT:    [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]]
+; DISABLE-NEXT:    br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1:.*]], label %[[HEADER]]
+; DISABLE:       [[EXIT_LOOPEXIT]]:
+; DISABLE-NEXT:    [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ]
+; DISABLE-NEXT:    br label %[[EXIT]]
+; DISABLE:       [[EXIT_LOOPEXIT1]]:
+; DISABLE-NEXT:    br label %[[EXIT]]
+; DISABLE:       [[EXIT]]:
+; DISABLE-NEXT:    [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ]
+; DISABLE-NEXT:    ret ptr [[RES]]
 ;
-  %5 = icmp eq ptr %0, %1
-  %6 = icmp eq ptr %2, %3
-  %7 = or i1 %5, %6
-  br i1 %7, label %21, label %8
-
-8:
-  %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
-  %10 = load i16, ptr %9, align 1
-  br label %14
-
-11:
-  %12 = getelementptr inbounds i16, ptr %15, i64 1
-  %13 = icmp eq ptr %12, %3
-  br i1 %13, label %18, label %14
-
-14:
-  %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
-  %16 = load i16, ptr %15, align 1
-  %17 = icmp eq i16 %10, %16
-  br i1 %17, label %21, label %11
-
-18:
-  %19 = getelementptr inbounds i16, ptr %9, i64 1
-  %20 = icmp eq ptr %19, %1
-  br i1 %20, label %21, label %8
-
-21:
-  %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
-  ret ptr %22
+entry:
+  %search_test = icmp eq ptr %search_start, %search_end
+  %needle_test = icmp eq ptr %needle_start, %needle_end
+  %combined_test = or i1 %search_test, %needle_test
+  br i1 %combined_test, label %exit, label %header
+
+header:
+  %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ]
+  %search_load = load i16, ptr %search_ptr, align 1
+  br label %match_check
+
+needle_check:
+  %needle_next = getelementptr inbounds i16, ptr %needle_ptr, i64 1
+  %needle_cmp = icmp eq ptr %needle_next, %needle_end
+  br i1 %needle_cmp, label %search_check, label %match_check
+
+match_check:
+  %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ]
+  %needle_load = load i16, ptr %needle_ptr, align 1
+  %match_cmp = icmp eq i16 %search_load, %needle_load
+  br i1 %match_cmp, label %exit, label %needle_check
+
+search_check:
+  %search_next = getelementptr inbounds i16, ptr %search_ptr, i64 1
+  %search_cmp = icmp eq ptr %search_next, %search_end
+  br i1 %search_cmp, label %exit, label %header
+
+exit:
+  %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ]
+  ret ptr %res
 }
 
 ; From here on we only test for the presence/absence of the intrinsic.
@@ -320,126 +325,128 @@ define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
 
 ; Same as @find_first_of_i8 but with `ne' comparison.
 ; This is rejected for now, but should eventually be supported.
-define ptr @find_first_not_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+define ptr @find_first_not_of_i8(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 {
 ; CHECK-LABEL: define ptr @find_first_not_of_i8(
 ; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
 ;
 ; DISABLE-LABEL: define ptr @find_first_not_of_i8(
 ; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
 ;
-  %5 = icmp eq ptr %0, %1
-  %6 = icmp eq ptr %2, %3
-  %7 = or i1 %5, %6
-  br i1 %7, label %21, label %8
-
-8:
-  %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
-  %10 = load i8, ptr %9, align 1
-  br label %14
-
-11:
-  %12 = getelementptr inbounds i8, ptr %15, i64 1
-  %13 = icmp eq ptr %12, %3
-  br i1 %13, label %18, label %14
-
-14:
-  %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
-  %16 = load i8, ptr %15, align 1
-  %17 = icmp ne i8 %10, %16
-  br i1 %17, label %21, label %11
-
-18:
-  %19 = getelementptr inbounds i8, ptr %9, i64 1
-  %20 = icmp eq ptr %19, %1
-  br i1 %20, label %21, label %8
-
-21:
-  %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
-  ret ptr %22
+entry:
+  %search_test = icmp eq ptr %search_start, %search_end
+  %needle_test = icmp eq ptr %needle_start, %needle_end
+  %combined_test = or i1 %search_test, %needle_test
+  br i1 %combined_test, label %exit, label %header
+
+header:
+  %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ]
+  %search_load = load i8, ptr %search_ptr, align 1
+  br label %match_check
+
+needle_check:
+  %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1
+  %needle_cmp = icmp eq ptr %needle_next, %needle_end
+  br i1 %needle_cmp, label %search_check, label %match_check
+
+match_check:
+  %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ]
+  %needle_load = load i8, ptr %needle_ptr, align 1
+  %match_cmp = icmp ne i8 %search_load, %needle_load
+  br i1 %match_cmp, label %exit, label %needle_check
+
+search_check:
+  %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1
+  %search_cmp = icmp eq ptr %search_next, %search_end
+  br i1 %search_cmp, label %exit, label %header
+
+exit:
+  %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ]
+  ret ptr %res
 }
 
 ; This is the same as @find_first_of_i8 but without SVE2, which we require to
 ; perform the conversion.
-define ptr @find_first_of_i8_nosve2(ptr %0, ptr %1, ptr %2, ptr %3) {
+define ptr @find_first_of_i8_nosve2(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) {
 ; CHECK-LABEL: define ptr @find_first_of_i8_nosve2(
 ; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
 ;
 ; DISABLE-LABEL: define ptr @find_first_of_i8_nosve2(
 ; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
 ;
-  %5 = icmp eq ptr %0, %1
-  %6 = icmp eq ptr %2, %3
-  %7 = or i1 %5, %6
-  br i1 %7, label %21, label %8
-
-8:
-  %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
-  %10 = load i8, ptr %9, align 1
-  br label %14
-
-11:
-  %12 = getelementptr inbounds i8, ptr %15, i64 1
-  %13 = icmp eq ptr %12, %3
-  br i1 %13, label %18, label %14
-
-14:
-  %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
-  %16 = load i8, ptr %15, align 1
-  %17 = icmp eq i8 %10, %16
-  br i1 %17, label %21, label %11
-
-18:
-  %19 = getelementptr inbounds i8, ptr %9, i64 1
-  %20 = icmp eq ptr %19, %1
-  br i1 %20, label %21, label %8
-
-21:
-  %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
-  ret ptr %22
+entry:
+  %search_test = icmp eq ptr %search_start, %search_end
+  %needle_test = icmp eq ptr %needle_start, %needle_end
+  %combined_test = or i1 %search_test, %needle_test
+  br i1 %combined_test, label %exit, label %header
+
+header:
+  %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ]
+  %search_load = load i8, ptr %search_ptr, align 1
+  br label %match_check
+
+needle_check:
+  %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1
+  %needle_cmp = icmp eq ptr %needle_next, %needle_end
+  br i1 %needle_cmp, label %search_check, label %match_check
+
+match_check:
+  %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ]
+  %needle_load = load i8, ptr %needle_ptr, align 1
+  %match_cmp = icmp eq i8 %search_load, %needle_load
+  br i1 %match_cmp, label %exit, label %needle_check
+
+search_check:
+  %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1
+  %search_cmp = icmp eq ptr %search_next, %search_end
+  br i1 %search_cmp, label %exit, label %header
+
+exit:
+  %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ]
+  ret ptr %res
 }
 
 ; Same as @find_first_of_i8 but here we use the inner PHI outside the loop nest.
 ; This isn't supported.
-define ptr @find_first_of_i8_outside_use(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+define ptr @find_first_of_i8_outside_use(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 {
 ; CHECK-LABEL: define ptr @find_first_of_i8_outside_use(
 ; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
 ;
 ; DISABLE-LABEL: define ptr @find_first_of_i8_outside_use(
 ; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}}
 ;
-  %5 = icmp eq ptr %0, %1
-  %6 = icmp eq ptr %2, %3
-  %7 = or i1 %5, %6
-  br i1 %7, label %21, label %8
-
-8:
-  %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
-  %10 = load i8, ptr %9, align 1
-  br label %14
-
-11:
-  %12 = getelementptr inbounds i8, ptr %15, i64 1
-  %13 = icmp eq ptr %12, %3
-  br i1 %13, label %18, label %14
-
-14:
-  %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
-  %16 = load i8, ptr %15, align 1
-  %17 = icmp ne i8 %10, %16
-  br i1 %17, label %21, label %11
-
-18:
-  %19 = getelementptr inbounds i8, ptr %9, i64 1
-  %20 = icmp eq ptr %19, %1
-  br i1 %20, label %21, label %8
-
-21:
-  %22 = phi ptr [ %1, %4 ], [  %9, %14 ], [ %1, %18 ]
-  %23 = phi ptr [ %3, %4 ], [ %15, %14 ], [ %3, %18 ]
-  ret ptr %23
+entry:
+  %search_test = icmp eq ptr %search_start, %search_end
+  %needle_test = icmp eq ptr %needle_start, %needle_end
+  %combined_test = or i1 %search_test, %needle_test
+  br i1 %combined_test, label %exit, label %header
+
+header:
+  %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ]
+  %search_load = load i8, ptr %search_ptr, align 1
+  br label %match_check
+
+needle_check:
+  %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1
+  %needle_cmp = icmp eq ptr %needle_next, %needle_end
+  br i1 %needle_cmp, label %search_check, label %match_check
+
+match_check:
+  %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ]
+  %needle_load = load i8, ptr %needle_ptr, align 1
+  %match_cmp = icmp eq i8 %search_load, %needle_load
+  br i1 %match_cmp, label %exit, label %needle_check
+
+search_check:
+  %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1
+  %search_cmp = icmp eq ptr %search_next, %search_end
+  br i1 %search_cmp, label %exit, label %header
+
+exit:
+  %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ]
+  %use = phi ptr [ %needle_end, %entry ], [ %needle_ptr, %match_check ], [ %needle_end, %search_check ]
+  ret ptr %res
 }
 
 attributes #0 = { "target-features"="+sve2" }
-;.
+
 ; CHECK: [[PROF0]] = !{!"branch_weights", i32 10, i32 90}
-;.

>From db2fbc69febbb073ab787385b78936c43c313105 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Tue, 4 Feb 2025 13:56:25 +0000
Subject: [PATCH 6/6] Add checks for loop invariance

---
 llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 44fe5ba3a0bfdaa..38eedf92d8c2464 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -1158,6 +1158,12 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
                   m_BasicBlock(ExitFail), m_Specific(Header))))
     return false;
 
+  if (!CurLoop->isLoopInvariant(SearchStart) ||
+      !CurLoop->isLoopInvariant(SearchEnd) ||
+      !CurLoop->isLoopInvariant(NeedleStart) ||
+      !CurLoop->isLoopInvariant(NeedleEnd))
+    return false;
+
   LLVM_DEBUG(dbgs() << "Found idiom in loop: \n" << *CurLoop << "\n\n");
 
   transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, SearchStart,



More information about the llvm-commits mailing list