[llvm] [AArch64] Add MATCH loops to LoopIdiomVectorizePass (PR #101976)

Ricardo Jesus via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 5 06:27:56 PDT 2024


https://github.com/rj-jesus created https://github.com/llvm/llvm-project/pull/101976

This patch adds a new loop idiom to LoopIdiomVectorizePass, enabling it to
recognise and vectorise loops such as:
```cpp
template<class InputIt, class ForwardIt>
InputIt find_first_of(InputIt first, InputIt last,
                      ForwardIt s_first, ForwardIt s_last)
{
  for (; first != last; ++first)
    for (ForwardIt it = s_first; it != s_last; ++it)
      if (*first == *it)
        return first;
  return last;
}
```

These loops match the C++ standard library function `std::find_first_of`.

The loops are vectorised using the `@llvm.experimental.vector.match` intrinsic added in #101974.
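
At their core, the vectorised loops combine the match intrinsic with an
or-reduction to test a whole vector of haystack elements against a vector of
needle elements at once, and use `@llvm.experimental.cttz.elts` to recover the
index of the first match. A simplified sketch of the key IR (names
abbreviated; see the tests in the second patch for the full sequence):
```llvm
; Match a whole vector of haystack bytes against a vector of needle bytes.
%match = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8(<vscale x 16 x i8> %haystack, <vscale x 16 x i8> %needles, <vscale x 16 x i1> %mask, i32 16)
; Did any active lane match?
%any = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %match)
; If so, this is the offset of the first matching element.
%idx = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %match, i1 true)
```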

From ba6e9b594549ce7972f63af1ba8d8b434641cdf3 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Fri, 19 Jul 2024 16:10:51 +0100
Subject: [PATCH 1/2] [AArch64] Add @llvm.experimental.vector.match

This patch introduces an experimental intrinsic for matching the
elements of one vector against the elements of another.

For AArch64 targets that support SVE2, it lowers to a MATCH instruction
for supported fixed and scalable types.
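
For example, for <vscale x 16 x i8> operands the intrinsic lowers to a
single instruction (taken from the test included below):

    match p0.b, p0/z, z0.b, z1.b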
---
 llvm/docs/LangRef.rst                         | 45 +++++++++++++++
 .../llvm/Analysis/TargetTransformInfo.h       |  9 +++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  2 +
 llvm/include/llvm/IR/Intrinsics.td            | 10 ++++
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  5 ++
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  9 +++
 .../Target/AArch64/AArch64ISelLowering.cpp    | 46 +++++++++++++++
 .../AArch64/AArch64TargetTransformInfo.cpp    | 12 ++++
 .../AArch64/AArch64TargetTransformInfo.h      |  2 +
 .../AArch64/intrinsic-vector-match-sve2.ll    | 57 +++++++++++++++++++
 10 files changed, 197 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index b17e3c828ed3d..dd9851d1af078 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19637,6 +19637,51 @@ are undefined.
     }
 
 
+'``llvm.experimental.vector.match.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. Support for specific vector types is target
+dependent.
+
+::
+
+    declare <<n> x i1> @llvm.experimental.vector.match(<<n> x <ty>> %op1, <<n> x <ty>> %op2, <<n> x i1> %mask, i32 <segsize>)
+    declare <vscale x <n> x i1> @llvm.experimental.vector.match(<vscale x <n> x <ty>> %op1, <vscale x <n> x <ty>> %op2, <vscale x <n> x i1> %mask, i32 <segsize>)
+
+Overview:
+"""""""""
+
+Find elements of the first argument matching any elements of the second.
+
+Arguments:
+""""""""""
+
+The first argument is the search vector, the second argument is the vector of
+elements we are searching for (i.e. for which we consider a match successful),
+and the third argument is a mask that controls which elements of the first
+argument are active. The fourth argument is an immediate that sets the segment
+size for the search window.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.vector.match``' intrinsic compares each element in the
+first argument against potentially several elements of the second, placing
+``1`` in the corresponding element of the output vector if any comparison is
+successful, and ``0`` otherwise. Inactive elements in the mask are set to ``0``
+in the output. The segment size controls the number of elements of the second
+argument that are compared against.
+
+For example, for vectors with 16 elements, if ``segsize = 16`` then each
+element of the first argument is compared against all 16 elements of the second
+argument; but if ``segsize = 4``, then each of the first four elements of the
+first argument is compared against the first four elements of the second
+argument, each of the second four elements of the first argument is compared
+against the second four elements of the second argument, and so forth.
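+
+For example, with ``%op1 = <1, 2, 3, 4>``, ``%op2 = <4, 3, 9, 9>``, an
+all-active mask, and ``segsize = 4``, the result is ``<0, 0, 1, 1>``, as only
+the last two elements of ``%op1`` occur in ``%op2``.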
+
 Matrix Intrinsics
 -----------------
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 38e8b9da21397..786c13a177ccf 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1746,6 +1746,10 @@ class TargetTransformInfo {
   bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
                              Align Alignment) const;
 
+  /// \returns True if the target supports vector match operations for
+  /// the vector type `VT` using a segment size of `SegSize`.
+  bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;
+
   struct VPLegalization {
     enum VPTransform {
       // keep the predicating parameter
@@ -2184,6 +2188,7 @@ class TargetTransformInfo::Concept {
   virtual bool supportsScalableVectors() const = 0;
   virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                      Align Alignment) const = 0;
+  virtual bool hasVectorMatch(VectorType *VT, unsigned SegSize) const = 0;
   virtual VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
@@ -2952,6 +2957,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.hasActiveVectorLength(Opcode, DataType, Alignment);
   }
 
+  bool hasVectorMatch(VectorType *VT, unsigned SegSize) const override {
+    return Impl.hasVectorMatch(VT, SegSize);
+  }
+
   VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const override {
     return Impl.getVPLegalizationStrategy(PI);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index d208a710bb27f..36621861ab8c8 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -958,6 +958,8 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  bool hasVectorMatch(VectorType *VT, unsigned SegSize) const { return false; }
+
   TargetTransformInfo::VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const {
     return TargetTransformInfo::VPLegalization(
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index b4e758136b39f..f6d77aa596f60 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1892,6 +1892,16 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
                              [ IntrArgMemOnly ]>;
 
+// Experimental match
+def int_experimental_vector_match : DefaultAttrsIntrinsic<
+                             [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ],
+                             [ llvm_anyvector_ty,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,  // Mask
+                               llvm_i32_ty ],  // Segment size
+                             [ IntrNoMem, IntrNoSync, IntrWillReturn,
+                               ImmArg<ArgIndex<3>> ]>;
+
 // Operators
 let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
   // Integer arithmetic
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index dcde78925bfa9..d8314af0537fe 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1352,6 +1352,11 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
   return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
 }
 
+bool TargetTransformInfo::hasVectorMatch(VectorType *VT,
+                                         unsigned SegSize) const {
+  return TTIImpl->hasVectorMatch(VT, SegSize);
+}
+
 TargetTransformInfo::Concept::~Concept() = default;
 
 TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 9d617c7acd13c..9cb7d65975b9f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8096,6 +8096,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
              DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ResultVT, Vec, Index));
     return;
   }
+  case Intrinsic::experimental_vector_match: {
+    auto *VT = dyn_cast<VectorType>(I.getOperand(0)->getType());
+    auto SegmentSize = cast<ConstantInt>(I.getOperand(3))->getLimitedValue();
+    const auto &TTI =
+        TLI.getTargetMachine().getTargetTransformInfo(*I.getFunction());
+    assert(VT && TTI.hasVectorMatch(VT, SegmentSize) && "Unsupported type!");
+    visitTargetIntrinsic(I, Intrinsic);
+    return;
+  }
   case Intrinsic::vector_reverse:
     visitVectorReverse(I);
     return;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7704321a0fc3a..050807142fc0a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6106,6 +6106,51 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
         DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
     return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
   }
+  case Intrinsic::experimental_vector_match: {
+    SDValue ID =
+        DAG.getTargetConstant(Intrinsic::aarch64_sve_match, dl, MVT::i64);
+
+    auto Op1 = Op.getOperand(1);
+    auto Op2 = Op.getOperand(2);
+    auto Mask = Op.getOperand(3);
+    auto SegmentSize =
+        cast<ConstantSDNode>(Op.getOperand(4))->getLimitedValue();
+
+    EVT VT = Op.getValueType();
+    auto MinNumElts = VT.getVectorMinNumElements();
+
+    assert(Op1.getValueType() == Op2.getValueType() && "Type mismatch.");
+    assert(Op1.getValueSizeInBits().getKnownMinValue() == 128 &&
+           "Custom lower only works on 128-bit segments.");
+    assert((Op1.getValueType().getVectorElementType() == MVT::i8  ||
+            Op1.getValueType().getVectorElementType() == MVT::i16) &&
+           "Custom lower only supports 8-bit or 16-bit characters.");
+    assert(SegmentSize == MinNumElts && "Custom lower needs segment size to "
+                                        "match minimum number of elements.");
+
+    if (VT.isScalableVector())
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Mask, Op1, Op2);
+
+    // We can use the SVE2 match instruction to lower this intrinsic by
+    // converting the operands to scalable vectors, doing a match, and then
+    // extracting a fixed-width subvector from the scalable vector.
+
+    EVT OpVT = Op1.getValueType();
+    EVT OpContainerVT = getContainerForFixedLengthVector(DAG, OpVT);
+    EVT MatchVT = OpContainerVT.changeElementType(MVT::i1);
+
+    auto ScalableOp1 = convertToScalableVector(DAG, OpContainerVT, Op1);
+    auto ScalableOp2 = convertToScalableVector(DAG, OpContainerVT, Op2);
+    auto ScalableMask = DAG.getNode(ISD::SIGN_EXTEND, dl, OpVT, Mask);
+    ScalableMask = convertFixedMaskToScalableVector(ScalableMask, DAG);
+
+    SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MatchVT, ID,
+                                ScalableMask, ScalableOp1, ScalableOp2);
+
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT,
+                       DAG.getNode(ISD::SIGN_EXTEND, dl, OpContainerVT, Match),
+                       DAG.getVectorIdxConstant(0, dl));
+  }
   }
 }
 
@@ -26544,6 +26589,7 @@ void AArch64TargetLowering::ReplaceNodeResults(
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
       return;
     }
+    case Intrinsic::experimental_vector_match:
     case Intrinsic::get_active_lane_mask: {
       if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
         return;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b8f19fa87e2ab..806dc856c5862 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3835,6 +3835,18 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
   }
 }
 
+bool AArch64TTIImpl::hasVectorMatch(VectorType *VT, unsigned SegSize) const {
+  // Check that the target has SVE2 (and SVE is available), that `VT` is a
+  // legal 128-bit type for MATCH (16 x i8 or 8 x i16), and that the segment
+  // size matches the minimum number of elements.
+  if (ST->hasSVE2() && ST->isSVEAvailable() &&
+      VT->getPrimitiveSizeInBits().getKnownMinValue() == 128 &&
+      VT->getElementCount().getKnownMinValue() == SegSize &&
+      (VT->getElementCount().getKnownMinValue() ==  8 ||
+       VT->getElementCount().getKnownMinValue() == 16))
+    return true;
+  return false;
+}
+
 InstructionCost
 AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                        FastMathFlags FMF,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a9189fd53f40b..6ad21a9e0a77a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -391,6 +391,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return ST->hasSVE();
   }
 
+  bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;
+
   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                              std::optional<FastMathFlags> FMF,
                                              TTI::TargetCostKind CostKind);
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll b/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll
new file mode 100644
index 0000000000000..0df92dfa80000
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
+
+define <vscale x 16 x i1> @match_nxv16i8(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    match p0.b, p0/z, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %op2, <vscale x 16 x i1> %mask, i32 16)
+  ret <vscale x 16 x i1> %r
+}
+
+define <vscale x 8 x i1> @match_nxv8i16(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %op2, <vscale x 8 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    match p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %r = tail call <vscale x 8 x i1> @llvm.experimental.vector.match(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %op2, <vscale x 8 x i1> %mask, i32 8)
+  ret <vscale x 8 x i1> %r
+}
+
+define <16 x i1> @match_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) #0 {
+; CHECK-LABEL: match_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v2.16b, v2.16b, #7
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    cmlt v2.16b, v2.16b, #0
+; CHECK-NEXT:    cmpne p0.b, p0/z, z2.b, #0
+; CHECK-NEXT:    match p0.b, p0/z, z0.b, z1.b
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask, i32 16)
+  ret <16 x i1> %r
+}
+
+define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #0 {
+; CHECK-LABEL: match_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    shl v2.8h, v2.8h, #15
+; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
+; CHECK-NEXT:    cmpne p0.h, p0/z, z2.h, #0
+; CHECK-NEXT:    match p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %r = tail call <8 x i1> @llvm.experimental.vector.match(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask, i32 8)
+  ret <8 x i1> %r
+}
+
+attributes #0 = { "target-features"="+sve2" }

From a6e26ffeb9770df51b0338fb151d9b314b192343 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Mon, 15 Jul 2024 17:57:30 +0100
Subject: [PATCH 2/2] [AArch64] Add MATCH loops to LoopIdiomVectorizePass

This patch adds a new loop idiom to LoopIdiomVectorizePass, enabling it to
recognise and vectorise loops such as:

    template<class InputIt, class ForwardIt>
    InputIt find_first_of(InputIt first, InputIt last,
                          ForwardIt s_first, ForwardIt s_last)
    {
      for (; first != last; ++first)
        for (ForwardIt it = s_first; it != s_last; ++it)
          if (*first == *it)
            return first;
      return last;
    }

These loops match the C++ standard library's std::find_first_of.

The loops are vectorised using @llvm.experimental.vector.match, which is
added separately.
---
 .../Vectorize/LoopIdiomVectorize.cpp          | 442 +++++++++++++++++-
 llvm/test/CodeGen/AArch64/find-first-byte.ll  | 120 +++++
 2 files changed, 561 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/find-first-byte.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index cb31e2a2ecaec..a9683f08c5ab9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -79,6 +79,12 @@ static cl::opt<unsigned>
               cl::desc("The vectorization factor for byte-compare patterns."),
               cl::init(16));
 
+static cl::opt<bool>
+    DisableFindFirstByte("disable-loop-idiom-vectorize-find-first-byte",
+                         cl::Hidden, cl::init(false),
+                         cl::desc("Proceed with Loop Idiom Vectorize Pass, but "
+                                  "do not convert find-first-byte loop(s)."));
+
 static cl::opt<bool>
     VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false),
                 cl::desc("Verify loops generated Loop Idiom Vectorize Pass."));
@@ -136,6 +142,21 @@ class LoopIdiomVectorize {
                             PHINode *IndPhi, Value *MaxLen, Instruction *Index,
                             Value *Start, bool IncIdx, BasicBlock *FoundBB,
                             BasicBlock *EndBB);
+
+  bool recognizeFindFirstByte();
+
+  Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU,
+                             unsigned VF, unsigned CharWidth,
+                             BasicBlock *ExitSucc, BasicBlock *ExitFail,
+                             GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+                             Value *StartA, Value *EndA,
+                             Value *StartB, Value *EndB);
+
+  void transformFindFirstByte(PHINode *IndPhi, unsigned VF, unsigned CharWidth,
+                              BasicBlock *ExitSucc, BasicBlock *ExitFail,
+                              GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+                              Value *StartA, Value *EndA,
+                              Value *StartB, Value *EndB);
   /// @}
 };
 } // anonymous namespace
@@ -190,7 +211,13 @@ bool LoopIdiomVectorize::run(Loop *L) {
   LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %"
                     << CurLoop->getHeader()->getName() << "\n");
 
-  return recognizeByteCompare();
+  if (recognizeByteCompare())
+    return true;
+
+  if (recognizeFindFirstByte())
+    return true;
+
+  return false;
 }
 
 bool LoopIdiomVectorize::recognizeByteCompare() {
@@ -941,3 +968,416 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
       report_fatal_error("Loops must remain in LCSSA form!");
   }
 }
+
+bool LoopIdiomVectorize::recognizeFindFirstByte() {
+  // Currently the transformation only works on scalable vector types, although
+  // there is no fundamental reason why it cannot be made to work for
+  // fixed-width vectors too.
+  if (!TTI->supportsScalableVectors() || DisableFindFirstByte)
+    return false;
+
+  // Define some constants we need throughout.
+  // TODO: Some of these could be made configurable parameters. For example, we
+  // could allow CharWidth = 16 (and VF = 8).
+  unsigned VF = 16;
+  unsigned CharWidth = 8;
+  BasicBlock *Header = CurLoop->getHeader();
+  LLVMContext &Ctx = Header->getContext();
+  auto *CharTy = Type::getIntNTy(Ctx, CharWidth);
+  auto *CharVTy = ScalableVectorType::get(CharTy, VF);
+
+  // Check if the target supports efficient vector matches for vectors of
+  // bytes.
+  if (!TTI->hasVectorMatch(CharVTy, VF))
+    return false;
+
+  // In LoopIdiomVectorize::run we have already checked that the loop has a
+  // preheader, so we can assume it's in a canonical form.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 4)
+    return false;
+
+  // We expect this loop to have one nested loop.
+  if (CurLoop->getSubLoops().size() != 1)
+    return false;
+
+  auto *InnerLoop = CurLoop->getSubLoops().front();
+  PHINode *IndPhi = dyn_cast<PHINode>(&Header->front());
+
+  if (!IndPhi || IndPhi->getNumIncomingValues() != 2)
+    return false;
+
+  auto LoopBlocks = CurLoop->getBlocks();
+  // We are expecting the following blocks below. For now, we will bail out for
+  // anything deviating from this.
+  //
+  // .preheader:                                       ; preds = %.preheader.preheader, %23
+  //   %14 = phi ptr [ %24, %23 ], [ %3, %.preheader.preheader ]
+  //   %15 = load i8, ptr %14, align 1, !tbaa !14
+  //   br label %19
+  //
+  // 19:                                               ; preds = %16, %.preheader
+  //   %20 = phi ptr [ %7, %.preheader ], [ %17, %16 ]
+  //   %21 = load i8, ptr %20, align 1, !tbaa !14
+  //   %22 = icmp eq i8 %15, %21
+  //   br i1 %22, label %.loopexit.loopexit, label %16
+  //
+  // 16:                                               ; preds = %19
+  //   %17 = getelementptr inbounds i8, ptr %20, i64 1
+  //   %18 = icmp eq ptr %17, %10
+  //   br i1 %18, label %23, label %19, !llvm.loop !15
+  //
+  // 23:                                               ; preds = %16
+  //   %24 = getelementptr inbounds i8, ptr %14, i64 1
+  //   %25 = icmp eq ptr %24, %6
+  //   br i1 %25, label %.loopexit.loopexit5, label %.preheader, !llvm.loop !17
+  //
+  if (LoopBlocks[0]->sizeWithoutDebug() > 3 ||
+      LoopBlocks[1]->sizeWithoutDebug() > 4 ||
+      LoopBlocks[2]->sizeWithoutDebug() > 3 ||
+      LoopBlocks[3]->sizeWithoutDebug() > 3)
+    return false;
+
+  // If we match the pattern, IndPhi is going to be replaced. We cannot replace
+  // the loop if any of its other instructions are used outside of it.
+  for (BasicBlock *BB : LoopBlocks)
+    for (Instruction &I : *BB)
+      if (&I != IndPhi)
+        for (User *U : I.users())
+          if (!CurLoop->contains(cast<Instruction>(U)))
+            return false;
+
+  // Match the branch instruction for the header. We are expecting an
+  // unconditional branch to the inner loop.
+  BasicBlock *MatchBB;
+  if (!match(Header->getTerminator(), m_UnconditionalBr(MatchBB)) ||
+      !InnerLoop->contains(MatchBB))
+    return false;
+
+  // MatchBB should be the entry point into the inner loop, which contains the
+  // comparison between a search element and a candidate match.
+  ICmpInst::Predicate MatchPred;
+  BasicBlock *ExitSucc;
+  BasicBlock *InnerBB;
+  Value *LoadA, *LoadB;
+  if (!match(MatchBB->getTerminator(),
+             m_Br(m_ICmp(MatchPred, m_Value(LoadA), m_Value(LoadB)),
+                  m_BasicBlock(ExitSucc), m_BasicBlock(InnerBB))) ||
+      MatchPred != ICmpInst::Predicate::ICMP_EQ ||
+      !InnerLoop->contains(InnerBB))
+    return false;
+
+  // We expect a single use of IndPhi outside of CurLoop. The outside use
+  // should be a PHINode in ExitSucc coming from MatchBB.
+  // Note: Strictly speaking we are not checking for a *single* use of IndPhi
+  // outside of CurLoop here; however, below we check that we only exit CurLoop
+  // to ExitSucc in one place, so by construction this should be true. Besides,
+  // even if it is not, the transformation remains valid as long as every such
+  // use is a PHINode in ExitSucc with MatchBB as its incoming block.
+  for (Use &U : IndPhi->uses())
+    if (CurLoop->contains(cast<Instruction>(U.getUser())))
+      continue;
+    else if (auto *PN = dyn_cast<PHINode>(U.getUser());
+             !PN || PN->getParent() != ExitSucc ||
+             PN->getIncomingBlock(U) != MatchBB)
+      return false;
+
+  // Match the loads.
+  Value *A, *B;
+  if (!match(LoadA, m_Load(m_Value(A))) || !match(LoadB, m_Load(m_Value(B))))
+    return false;
+
+  // Make sure they are simple.
+  LoadInst *LoadAI = cast<LoadInst>(LoadA);
+  LoadInst *LoadBI = cast<LoadInst>(LoadB);
+  if (!LoadAI->isSimple() || !LoadBI->isSimple())
+    return false;
+
+  // The values loaded come from two PHIs that can only have two incoming
+  // values.
+  PHINode *PNA = dyn_cast<PHINode>(A);
+  PHINode *PNB = dyn_cast<PHINode>(B);
+  if (!PNA || PNA->getNumIncomingValues() != 2 ||
+      !PNB || PNB->getNumIncomingValues() != 2)
+    return false;
+
+  // One PHI comes from the outer loop, the other one from the inner loop.
+  // CurLoop contains PNA, InnerLoop PNB.
+  if (InnerLoop->contains(PNA))
+    std::swap(PNA, PNB);
+  if (PNA != &Header->front() || PNB != &MatchBB->front())
+    return false;
+
+  // The incoming values of both PHI nodes should be a start value and a GEP
+  // that increments the pointer by 1.
+  Value *StartA = PNA->getIncomingValue(0);
+  Value *IndexA = PNA->getIncomingValue(1);
+  if (CurLoop->contains(PNA->getIncomingBlock(0)))
+    std::swap(StartA, IndexA);
+
+  Value *StartB = PNB->getIncomingValue(0);
+  Value *IndexB = PNB->getIncomingValue(1);
+  if (InnerLoop->contains(PNB->getIncomingBlock(0)))
+    std::swap(StartB, IndexB);
+
+  // Match the GEPs.
+  if (!match(IndexA, m_GEP(m_Specific(PNA), m_One())) ||
+      !match(IndexB, m_GEP(m_Specific(PNB), m_One())))
+    return false;
+
+  GetElementPtrInst *GEPA = cast<GetElementPtrInst>(IndexA);
+  GetElementPtrInst *GEPB = cast<GetElementPtrInst>(IndexB);
+
+  // Check we are loading CharTy values.
+  if (!GEPA->getResultElementType()->isIntegerTy(CharWidth) ||
+      !GEPB->getResultElementType()->isIntegerTy(CharWidth) ||
+      !LoadAI->getType()->isIntegerTy(CharWidth) ||
+      !LoadBI->getType()->isIntegerTy(CharWidth))
+    return false;
+
+  // InnerBB should increment the address of the key we are checking.
+  BasicBlock *OuterBB;
+  Value *EndB;
+  if (!match(InnerBB->getTerminator(),
+             m_Br(m_ICmp(MatchPred, m_Specific(GEPB), m_Value(EndB)),
+                  m_BasicBlock(OuterBB), m_Specific(MatchBB))) ||
+      MatchPred != ICmpInst::Predicate::ICMP_EQ ||
+      !CurLoop->contains(OuterBB))
+    return false;
+
+  // OuterBB should increment the address of the element we are looking for.
+  Value *EndA;
+  BasicBlock *ExitFail;
+  if (!match(OuterBB->getTerminator(),
+             m_Br(m_ICmp(MatchPred, m_Specific(GEPA), m_Value(EndA)),
+                  m_BasicBlock(ExitFail), m_Specific(Header))) ||
+      MatchPred != ICmpInst::Predicate::ICMP_EQ)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n"
+                    << *CurLoop << "\n\n");
+
+  transformFindFirstByte(IndPhi, VF, CharWidth, ExitSucc, ExitFail, GEPA, GEPB,
+                         StartA, EndA, StartB, EndB);
+  return true;
+}
+
+Value *LoopIdiomVectorize::expandFindFirstByte(
+    IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, unsigned CharWidth,
+    BasicBlock *ExitSucc, BasicBlock *ExitFail,
+    GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+    Value *StartA, Value *EndA, Value *StartB, Value *EndB) {
+  // Set up some types and constants that we intend to reuse.
+  auto *I64Ty = Builder.getInt64Ty();
+  auto *I32Ty = Builder.getInt32Ty();
+  auto *PtrTy = Builder.getPtrTy();
+  auto *CharTy = Builder.getIntNTy(CharWidth);
+  auto *PredVTy = ScalableVectorType::get(Builder.getInt1Ty(), VF);
+  auto *CharVTy = ScalableVectorType::get(CharTy, VF);
+
+  // Other common arguments.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+  LLVMContext &Ctx = PHBranch->getContext();
+  Value *Passthru = ConstantInt::getNullValue(CharVTy);
+
+  // Split block in the original loop preheader.
+  BasicBlock *OldPH = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "oldph");
+
+  // Create the blocks that we're going to need. We separate them between the
+  // outer (OL) and inner (IL) loops, with roles similar to those of the
+  // original loops.
+  //   1. Check that we have at least one element to load. (OL)
+  //   2. Set up masks and load a vector of elements. (OL)
+  //   3. Check that we have at least one key to match against. (IL)
+  //   4. Check whether we can load a full register of keys. (IL)
+  //   5.   If so, load it. (IL)
+  //   6.   If not, set up a new mask, load the keys that remain, and splat
+  //        the first one into the remainder of the register. (IL)
+  //   7. Carry out the match test; if successful go to (8), otherwise loop
+  //      back to (3). (IL)
+  //   8. Figure out the index of the match.
+  // Note that only block (8) is *not* part of a loop (inner or outer).
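+  //
+  // Schematically, the control flow built below is:
+  //   (1) -> (2) or ExitFail;   (2) -> (3);   (3) -> (4) or back to (1);
+  //   (4) -> (5) or (6);   (5) and (6) -> (7);   (7) -> (8) or back to (3);
+  //   (8) -> ExitSucc.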
+
+  BasicBlock *BB1 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+  BasicBlock *BB2 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+  BasicBlock *BB3 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+  BasicBlock *BB4 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+  BasicBlock *BB5 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+  BasicBlock *BB6 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+  BasicBlock *BB7 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+  BasicBlock *BB8 = BasicBlock::Create(Ctx, "", OldPH->getParent(), OldPH);
+
+  // Update LoopInfo with the new loops.
+  auto OuterLoop = LI->AllocateLoop();
+  auto InnerLoop = LI->AllocateLoop();
+
+  if (CurLoop->getParentLoop()) {
+    CurLoop->getParentLoop()->addBasicBlockToLoop(BB8, *LI);
+    CurLoop->getParentLoop()->addChildLoop(OuterLoop);
+  } else {
+    LI->addTopLevelLoop(OuterLoop);
+  }
+
+  // Add the inner loop to the outer.
+  OuterLoop->addChildLoop(InnerLoop);
+
+  // Add the new basic blocks to the corresponding loops.
+  OuterLoop->addBasicBlockToLoop(BB1, *LI);
+  OuterLoop->addBasicBlockToLoop(BB2, *LI);
+  InnerLoop->addBasicBlockToLoop(BB3, *LI);
+  InnerLoop->addBasicBlockToLoop(BB4, *LI);
+  InnerLoop->addBasicBlockToLoop(BB5, *LI);
+  InnerLoop->addBasicBlockToLoop(BB6, *LI);
+  InnerLoop->addBasicBlockToLoop(BB7, *LI);
+
+  // Update the terminator added by SplitBlock to branch to the first block.
+  Preheader->getTerminator()->setSuccessor(0, BB1);
+  DTU.applyUpdates({{DominatorTree::Insert, Preheader, BB1},
+                    {DominatorTree::Delete, Preheader, OldPH}});
+
+  // (1) Check the outer loop iteration.
+  Builder.SetInsertPoint(BB1);
+  PHINode *PNA = Builder.CreatePHI(PtrTy, 2, "pna");
+  Value *CheckA = Builder.CreateICmpULT(PNA, EndA);
+  Builder.CreateCondBr(CheckA, BB2, ExitFail);
+  DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2},
+                    {DominatorTree::Insert, BB1, ExitFail}});
+
+  // (2) Outer loop body.
+  Builder.SetInsertPoint(BB2);
+  Value *IncA = Builder.CreateGEP(CharTy, PNA, ConstantInt::get(I64Ty, VF), "",
+                                  GEPA->isInBounds());
+  Value *CheckIncA = Builder.CreateICmpUGT(IncA, EndA);
+  Value *SelA = Builder.CreateSelect(CheckIncA, EndA, IncA);
+  Value *PredA = Builder.CreateIntrinsic(
+      Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+      {Builder.CreatePointerCast(PNA, I64Ty),
+       Builder.CreatePointerCast(SelA, I64Ty)});
+  Value *LoadA =
+      Builder.CreateMaskedLoad(CharVTy, PNA, Align(1), PredA, Passthru);
+  Value *PredBInit = Builder.CreateIntrinsic(
+      Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+      {ConstantInt::get(I64Ty, 0), ConstantInt::get(I64Ty, VF)});
+  Builder.CreateBr(BB3);
+  DTU.applyUpdates({{DominatorTree::Insert, BB2, BB3}});
+
+  // (3) Check the inner loop iteration.
+  Builder.SetInsertPoint(BB3);
+  PHINode *PNB = Builder.CreatePHI(PtrTy, 2, "pnb");
+  PHINode *PredBFull = Builder.CreatePHI(PredVTy, 2);
+  Value *CheckB = Builder.CreateICmpULT(PNB, EndB);
+  Builder.CreateCondBr(CheckB, BB4, BB1);
+  DTU.applyUpdates({{DominatorTree::Insert, BB3, BB4},
+                    {DominatorTree::Insert, BB3, BB1}});
+
+  // (4) Check load B.
+  Builder.SetInsertPoint(BB4);
+  Value *IncB = Builder.CreateGEP(CharTy, PNB, ConstantInt::get(I64Ty, VF), "",
+                                  GEPB->isInBounds());
+  Value *IfNotFullB = Builder.CreateICmpUGT(IncB, EndB);
+  Builder.CreateCondBr(IfNotFullB, BB6, BB5);
+  DTU.applyUpdates({{DominatorTree::Insert, BB4, BB6},
+                    {DominatorTree::Insert, BB4, BB5}});
+
+  // (5) Full load B.
+  Builder.SetInsertPoint(BB5);
+  Value *LoadBFull =
+      Builder.CreateMaskedLoad(CharVTy, PNB, Align(1), PredBFull, Passthru);
+  Builder.CreateBr(BB7);
+  DTU.applyUpdates({{DominatorTree::Insert, BB5, BB7}});
+
+  // (6) Partial load B.
+  Builder.SetInsertPoint(BB6);
+  Value *PredBPart = Builder.CreateIntrinsic(
+      Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+      {Builder.CreatePointerCast(PNB, I64Ty),
+       Builder.CreatePointerCast(EndB, I64Ty)});
+  Value *LoadBPart =
+      Builder.CreateMaskedLoad(CharVTy, PNB, Align(1), PredBPart, Passthru);
+  Value *LoadB0 = Builder.CreateExtractElement(LoadBPart, uint64_t(0));
+  Value *LoadBSplat =
+      Builder.CreateVectorSplat(PredVTy->getElementCount(), LoadB0);
+  LoadBPart = Builder.CreateSelect(PredBPart, LoadBPart, LoadBSplat);
+  Builder.CreateBr(BB7);
+  DTU.applyUpdates({{DominatorTree::Insert, BB6, BB7}});
+
+  // (7) Carry out match.
+  Builder.SetInsertPoint(BB7);
+  PHINode *PredBNext = Builder.CreatePHI(PredVTy, 2);
+  PHINode *LoadB = Builder.CreatePHI(CharVTy, 2);
+  Value *MatchPred = Builder.CreateIntrinsic(
+      Intrinsic::experimental_vector_match, {CharVTy},
+      {LoadA, LoadB, PredA, ConstantInt::get(I32Ty, VF)});
+  Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred);
+  Builder.CreateCondBr(IfAnyMatch, BB8, BB3);
+  DTU.applyUpdates({{DominatorTree::Insert, BB7, BB8},
+                    {DominatorTree::Insert, BB7, BB3}});
+
+  // (8) Match success.
+  Builder.SetInsertPoint(BB8);
+  Value *MatchCnt = Builder.CreateIntrinsic(
+      Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()},
+      {MatchPred, /*ZeroIsPoison=*/Builder.getInt1(true)});
+  Value *MatchVal = Builder.CreateGEP(CharTy, PNA, MatchCnt);
+  Builder.CreateBr(ExitSucc);
+  DTU.applyUpdates({{DominatorTree::Insert, BB8, ExitSucc}});
+
+  // Set incoming values for PHIs.
+  PNA->addIncoming(StartA, Preheader);
+  PNA->addIncoming(IncA, BB3);
+
+  PNB->addIncoming(StartB, BB2);
+  PNB->addIncoming(IncB, BB7);
+  PredBFull->addIncoming(PredBInit, BB2);
+  PredBFull->addIncoming(PredBNext, BB7);
+
+  PredBNext->addIncoming(PredBFull, BB5);
+  PredBNext->addIncoming(PredBPart, BB6);
+  LoadB->addIncoming(LoadBFull, BB5);
+  LoadB->addIncoming(LoadBPart, BB6);
+
+  if (VerifyLoops) {
+    OuterLoop->verifyLoop();
+    InnerLoop->verifyLoop();
+    if (!OuterLoop->isRecursivelyLCSSAForm(*DT, *LI))
+      report_fatal_error("Loops must remain in LCSSA form!");
+    if (!InnerLoop->isRecursivelyLCSSAForm(*DT, *LI))
+      report_fatal_error("Loops must remain in LCSSA form!");
+  }
+
+  assert(OldPH->hasNPredecessors(0) && "Expected old loop to be unreachable.");
+
+  return MatchVal;
+}
+
+void LoopIdiomVectorize::transformFindFirstByte(
+    PHINode *IndPhi, unsigned VF, unsigned CharWidth,
+    BasicBlock *ExitSucc, BasicBlock *ExitFail,
+    GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+    Value *StartA, Value *EndA, Value *StartB, Value *EndB) {
+  // Insert the find first byte code at the end of the preheader block.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+  IRBuilder<> Builder(PHBranch);
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+  Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
+
+  Value *MatchVal =
+      expandFindFirstByte(Builder, DTU, VF, CharWidth, ExitSucc, ExitFail,
+                          GEPA, GEPB, StartA, EndA, StartB, EndB);
+
+  assert(PHBranch->isUnconditional() &&
+         "Expected preheader to terminate with an unconditional branch.");
+
+  // Add the result of the transformation as a new incoming value to the PHI
+  // nodes in ExitSucc that previously used IndPhi.
+  for (auto &PN : ExitSucc->phis())
+    for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
+      if (PN.getIncomingValue(i) == IndPhi)
+        PN.addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent());
+
+  // TODO: Perhaps we should call EliminateUnreachableBlocks here? The old
+  // blocks are left in place for now because we may want to reuse them to
+  // implement an alternative path for small arrays, for example.
+}
diff --git a/llvm/test/CodeGen/AArch64/find-first-byte.ll b/llvm/test/CodeGen/AArch64/find-first-byte.ll
new file mode 100644
index 0000000000000..4bf3df1d4817b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/find-first-byte.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mattr=+sve2 -mtriple=aarch64 -passes='loop(loop-idiom-vectorize),simplifycfg' -S < %s | FileCheck -check-prefix=SVE2 %s
+; RUN: opt -mattr=-sve2 -mtriple=aarch64 -passes='loop(loop-idiom-vectorize),simplifycfg' -S < %s | FileCheck -check-prefix=NOSVE2 %s
+
+define dso_local noundef ptr @first_byte_of(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+; SVE2-LABEL: define dso_local noundef ptr @first_byte_of(
+; SVE2-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
+; SVE2-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; SVE2-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; SVE2-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; SVE2-NEXT:    br i1 [[TMP7]], [[DOTLOOPEXIT1:label %.*]], label %[[DOTPREHEADER:.*]]
+; SVE2:       [[_PREHEADER:.*:]]
+; SVE2-NEXT:    [[PNA:%.*]] = phi ptr [ [[TMP10:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], [[TMP4:%.*]] ]
+; SVE2-NEXT:    [[TMP8:%.*]] = icmp ult ptr [[PNA]], [[TMP1]]
+; SVE2-NEXT:    br i1 [[TMP8]], label %[[BB9:.*]], [[DOTLOOPEXIT1]]
+; SVE2:       [[BB9]]:
+; SVE2-NEXT:    [[TMP10]] = getelementptr inbounds i8, ptr [[PNA]], i64 16
+; SVE2-NEXT:    [[TMP11:%.*]] = icmp ugt ptr [[TMP10]], [[TMP1]]
+; SVE2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], ptr [[TMP1]], ptr [[TMP10]]
+; SVE2-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[PNA]] to i64
+; SVE2-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP12]] to i64
+; SVE2-NEXT:    [[TMP15:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP13]], i64 [[TMP14]])
+; SVE2-NEXT:    [[TMP16:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNA]], i32 1, <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> zeroinitializer)
+; SVE2-NEXT:    [[TMP17:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
+; SVE2-NEXT:    br label %[[TMP18]]
+; SVE2:       [[TMP18]]:
+; SVE2-NEXT:    [[PNB:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP22:%.*]], %[[TMP33:.*]] ]
+; SVE2-NEXT:    [[TMP19:%.*]] = phi <vscale x 16 x i1> [ [[TMP17]], %[[BB9]] ], [ [[TMP34:%.*]], %[[TMP33]] ]
+; SVE2-NEXT:    [[TMP20:%.*]] = icmp ult ptr [[PNB]], [[TMP3]]
+; SVE2-NEXT:    br i1 [[TMP20]], label %[[BB21:.*]], label %[[DOTPREHEADER]]
+; SVE2:       [[BB21]]:
+; SVE2-NEXT:    [[TMP22]] = getelementptr inbounds i8, ptr [[PNB]], i64 16
+; SVE2-NEXT:    [[TMP23:%.*]] = icmp ugt ptr [[TMP22]], [[TMP3]]
+; SVE2-NEXT:    br i1 [[TMP23]], label %[[BB26:.*]], label %[[BB24:.*]]
+; SVE2:       [[BB24]]:
+; SVE2-NEXT:    [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNB]], i32 1, <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> zeroinitializer)
+; SVE2-NEXT:    br label %[[TMP33]]
+; SVE2:       [[BB26]]:
+; SVE2-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[PNB]] to i64
+; SVE2-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; SVE2-NEXT:    [[TMP29:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP27]], i64 [[TMP28]])
+; SVE2-NEXT:    [[TMP30:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNB]], i32 1, <vscale x 16 x i1> [[TMP29]], <vscale x 16 x i8> zeroinitializer)
+; SVE2-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 16 x i8> [[TMP30]], i64 0
+; SVE2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP31]], i64 0
+; SVE2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; SVE2-NEXT:    [[TMP32:%.*]] = select <vscale x 16 x i1> [[TMP29]], <vscale x 16 x i8> [[TMP30]], <vscale x 16 x i8> [[DOTSPLAT]]
+; SVE2-NEXT:    br label %[[TMP33]]
+; SVE2:       [[TMP33]]:
+; SVE2-NEXT:    [[TMP34]] = phi <vscale x 16 x i1> [ [[TMP19]], %[[BB24]] ], [ [[TMP29]], %[[BB26]] ]
+; SVE2-NEXT:    [[TMP35:%.*]] = phi <vscale x 16 x i8> [ [[TMP25]], %[[BB24]] ], [ [[TMP32]], %[[BB26]] ]
+; SVE2-NEXT:    [[TMP36:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP35]], <vscale x 16 x i1> [[TMP15]], i32 16)
+; SVE2-NEXT:    [[TMP37:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP36]])
+; SVE2-NEXT:    br i1 [[TMP37]], label %[[DOTLOOPEXIT:.*]], label %[[TMP18]]
+; SVE2:       [[_LOOPEXIT:.*:]]
+; SVE2-NEXT:    [[TMP38:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP36]], i1 true)
+; SVE2-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr [[PNA]], i64 [[TMP38]]
+; SVE2-NEXT:    br [[DOTLOOPEXIT1]]
+; SVE2:       [[_LOOPEXIT1:.*:]]
+; SVE2-NEXT:    [[TMP40:%.*]] = phi ptr [ [[TMP1]], [[TMP4]] ], [ [[TMP39]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTPREHEADER]] ]
+; SVE2-NEXT:    ret ptr [[TMP40]]
+;
+; NOSVE2-LABEL: define dso_local noundef ptr @first_byte_of(
+; NOSVE2-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
+; NOSVE2-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; NOSVE2-NEXT:    [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; NOSVE2-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; NOSVE2-NEXT:    br i1 [[TMP7]], [[DOTLOOPEXIT:label %.*]], label %[[DOTPREHEADER:.*]]
+; NOSVE2:       [[_LOOPEXIT:.*:]]
+; NOSVE2-NEXT:    [[TMP8:%.*]] = phi ptr [ [[TMP18:%.*]], %[[TMP17:.*]] ], [ [[TMP0]], [[TMP4:%.*]] ]
+; NOSVE2-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1
+; NOSVE2-NEXT:    br label %[[BB13:.*]]
+; NOSVE2:       [[BB10:.*]]:
+; NOSVE2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP14:%.*]], i64 1
+; NOSVE2-NEXT:    [[TMP12:%.*]] = icmp eq ptr [[TMP11]], [[TMP3]]
+; NOSVE2-NEXT:    br i1 [[TMP12]], label %[[TMP17]], label %[[BB13]]
+; NOSVE2:       [[BB13]]:
+; NOSVE2-NEXT:    [[TMP14]] = phi ptr [ [[TMP2]], %[[DOTPREHEADER]] ], [ [[TMP11]], %[[BB10]] ]
+; NOSVE2-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP14]], align 1
+; NOSVE2-NEXT:    [[TMP16:%.*]] = icmp eq i8 [[TMP9]], [[TMP15]]
+; NOSVE2-NEXT:    br i1 [[TMP16]], [[DOTLOOPEXIT]], label %[[BB10]]
+; NOSVE2:       [[TMP17]]:
+; NOSVE2-NEXT:    [[TMP18]] = getelementptr inbounds i8, ptr [[TMP8]], i64 1
+; NOSVE2-NEXT:    [[TMP19:%.*]] = icmp eq ptr [[TMP18]], [[TMP1]]
+; NOSVE2-NEXT:    br i1 [[TMP19]], [[DOTLOOPEXIT]], label %[[DOTPREHEADER]]
+; NOSVE2:       [[_LOOPEXIT2:.*:]]
+; NOSVE2-NEXT:    [[TMP40:%.*]] = phi ptr [ [[TMP1]], [[TMP4]] ], [ [[TMP8]], %[[BB13]] ], [ [[TMP1]], %[[TMP17]] ]
+; NOSVE2-NEXT:    ret ptr [[TMP40]]
+;
+  %5 = icmp eq ptr %0, %1
+  %6 = icmp eq ptr %2, %3
+  %7 = or i1 %5, %6
+  br i1 %7, label %21, label %8
+
+8:
+  %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
+  %10 = load i8, ptr %9, align 1
+  br label %14
+
+11:
+  %12 = getelementptr inbounds i8, ptr %15, i64 1
+  %13 = icmp eq ptr %12, %3
+  br i1 %13, label %18, label %14
+
+14:
+  %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
+  %16 = load i8, ptr %15, align 1
+  %17 = icmp eq i8 %10, %16
+  br i1 %17, label %21, label %11
+
+18:
+  %19 = getelementptr inbounds i8, ptr %9, i64 1
+  %20 = icmp eq ptr %19, %1
+  br i1 %20, label %21, label %8
+
+21:
+  %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
+  ret ptr %22
+}
+
+attributes #0 = { "target-features"="+sve2" }


