[llvm] [LoopIdiomVectorize] Test all needles when vectorising find_first_of loops. (PR #179298)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 10 05:16:08 PST 2026
https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/179298
>From 0cb99bd8ee69f604b836f34236194720f9b5a936 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Mon, 2 Feb 2026 05:02:11 -0800
Subject: [PATCH 1/4] [AArch64] Test all needles when vectorising find_first_of loops.
As described in #179187, the current FindFirstByte transformation in
LoopIdiomVectorizePass incorrectly exits early as soon as any needle in
the current needle vector matches a search element, even if an earlier
element of the search vector may match a needle in a later needle vector.
This patch implements the strategy described in the issue to ensure all
needles are tested before we return a matching search element.
---
.../Vectorize/LoopIdiomVectorize.cpp | 71 ++++++-----
.../LoopIdiom/AArch64/find-first-byte.ll | 115 ++++++++++--------
2 files changed, 104 insertions(+), 82 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 53129e2e5fbba..d052eb4201dfe 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -1199,23 +1199,23 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
// (1) Load the search array. Go to (2).
// (2) (a) Load the needle array.
// (b) Splat the first element to the inactive lanes.
- // (c) Check if any elements match. If so go to (3), otherwise go to (4).
- // (3) Compute the index of the first match and exit.
- // (4) Check if we've reached the end of the needle array. If not loop back to
- // (2), otherwise go to (5).
+ // (c) Accumulate any matches found. If we haven't reached the end of the
+ // needle array loop back to (2), otherwise go to (3).
+ // (3) Test if we found any match. If so go to (4), otherwise go to (5).
+ // (4) Compute the index of the first match and exit.
// (5) Check if we've reached the end of the search array. If not loop back to
// (1), otherwise exit.
- // Blocks (0,3) are not part of any loop. Blocks (1,5) and (2,4) belong to
- // the outer and inner loops, respectively.
+ // Blocks (0,4) are not part of any loop. Blocks (1,3,5) and (2) belong to the
+ // outer and inner loops, respectively.
BasicBlock *BB0 = BasicBlock::Create(Ctx, "mem_check", SPH->getParent(), SPH);
BasicBlock *BB1 =
BasicBlock::Create(Ctx, "find_first_vec_header", SPH->getParent(), SPH);
BasicBlock *BB2 =
- BasicBlock::Create(Ctx, "match_check_vec", SPH->getParent(), SPH);
+ BasicBlock::Create(Ctx, "needle_check_vec", SPH->getParent(), SPH);
BasicBlock *BB3 =
- BasicBlock::Create(Ctx, "calculate_match", SPH->getParent(), SPH);
+ BasicBlock::Create(Ctx, "match_check_vec", SPH->getParent(), SPH);
BasicBlock *BB4 =
- BasicBlock::Create(Ctx, "needle_check_vec", SPH->getParent(), SPH);
+ BasicBlock::Create(Ctx, "calculate_match", SPH->getParent(), SPH);
BasicBlock *BB5 =
BasicBlock::Create(Ctx, "search_check_vec", SPH->getParent(), SPH);
@@ -1226,7 +1226,7 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
if (auto ParentLoop = CurLoop->getParentLoop()) {
ParentLoop->addBasicBlockToLoop(BB0, *LI);
ParentLoop->addChildLoop(OuterLoop);
- ParentLoop->addBasicBlockToLoop(BB3, *LI);
+ ParentLoop->addBasicBlockToLoop(BB4, *LI);
} else {
LI->addTopLevelLoop(OuterLoop);
}
@@ -1236,9 +1236,9 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
// Add the new basic blocks to the corresponding loops.
OuterLoop->addBasicBlockToLoop(BB1, *LI);
+ OuterLoop->addBasicBlockToLoop(BB3, *LI);
OuterLoop->addBasicBlockToLoop(BB5, *LI);
InnerLoop->addBasicBlockToLoop(BB2, *LI);
- InnerLoop->addBasicBlockToLoop(BB4, *LI);
// Update the terminator added by SplitBlock to branch to the first block.
Preheader->getTerminator()->setSuccessor(0, BB0);
@@ -1294,12 +1294,14 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
PredSearch = Builder.CreateAnd(PredVF, PredSearch, "search_masked");
Value *LoadSearch = Builder.CreateMaskedLoad(
CharVTy, Search, Align(1), PredSearch, Passthru, "search_load_vec");
+ Value *MatchInit = Constant::getNullValue(PredVTy);
Builder.CreateBr(BB2);
DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2}});
// (2) Inner loop.
Builder.SetInsertPoint(BB2);
PHINode *Needle = Builder.CreatePHI(PtrTy, 2, "pneedle");
+ PHINode *Match = Builder.CreatePHI(PredVTy, 2, "pmatch");
// (2.a) Load the needle array.
Value *PredNeedle = Builder.CreateIntrinsic(
@@ -1320,17 +1322,27 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
LoadNeedle = Builder.CreateExtractVector(
FixedVectorType::get(CharTy, VF), LoadNeedle, uint64_t(0), "needle_vec");
- // (2.c) Test if there's a match.
- Value *MatchPred = Builder.CreateIntrinsic(
+ // (2.c) Accumulate matches.
+ Value *MatchSeg = Builder.CreateIntrinsic(
Intrinsic::experimental_vector_match, {CharVTy, LoadNeedle->getType()},
- {LoadSearch, LoadNeedle, PredSearch}, nullptr, "match_pred");
- Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred);
- Builder.CreateCondBr(IfAnyMatch, BB3, BB4);
+ {LoadSearch, LoadNeedle, PredSearch}, nullptr, "match_segment");
+ Value *MatchAcc = Builder.CreateOr(Match, MatchSeg, "match_accumulator");
+ Value *NextNeedle =
+ Builder.CreateGEP(CharTy, Needle, ConstVF, "needle_next_vec");
+ Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedle, NeedleEnd), BB2, BB3);
DTU.applyUpdates(
- {{DominatorTree::Insert, BB2, BB3}, {DominatorTree::Insert, BB2, BB4}});
+ {{DominatorTree::Insert, BB2, BB2}, {DominatorTree::Insert, BB2, BB3}});
- // (3) We found a match. Compute the index of its location and exit.
+ // (3) Check if we found a match.
Builder.SetInsertPoint(BB3);
+ PHINode *MatchPred = Builder.CreatePHI(PredVTy, 1, "match_pred");
+ Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred);
+ Builder.CreateCondBr(IfAnyMatch, BB4, BB5);
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, BB3, BB4}, {DominatorTree::Insert, BB3, BB5}});
+
+ // (4) We found a match. Compute the index of its location and exit.
+ Builder.SetInsertPoint(BB4);
PHINode *MatchLCSSA = Builder.CreatePHI(PtrTy, 1, "match_start");
PHINode *MatchPredLCSSA =
Builder.CreatePHI(MatchPred->getType(), 1, "match_vec");
@@ -1341,15 +1353,7 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
Value *MatchVal =
Builder.CreateGEP(CharTy, MatchLCSSA, MatchCnt, "match_res");
Builder.CreateBr(ExitSucc);
- DTU.applyUpdates({{DominatorTree::Insert, BB3, ExitSucc}});
-
- // (4) Check if we've reached the end of the needle array.
- Builder.SetInsertPoint(BB4);
- Value *NextNeedle =
- Builder.CreateGEP(CharTy, Needle, ConstVF, "needle_next_vec");
- Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedle, NeedleEnd), BB2, BB5);
- DTU.applyUpdates(
- {{DominatorTree::Insert, BB4, BB2}, {DominatorTree::Insert, BB4, BB5}});
+ DTU.applyUpdates({{DominatorTree::Insert, BB4, ExitSucc}});
// (5) Check if we've reached the end of the search array.
Builder.SetInsertPoint(BB5);
@@ -1364,14 +1368,17 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
Search->addIncoming(SearchStart, BB0);
Search->addIncoming(NextSearch, BB5);
Needle->addIncoming(NeedleStart, BB1);
- Needle->addIncoming(NextNeedle, BB4);
+ Needle->addIncoming(NextNeedle, BB2);
+ Match->addIncoming(MatchInit, BB1);
+ Match->addIncoming(MatchAcc, BB2);
// These are needed to retain LCSSA form.
- MatchLCSSA->addIncoming(Search, BB2);
- MatchPredLCSSA->addIncoming(MatchPred, BB2);
+ MatchPred->addIncoming(MatchAcc, BB2);
+ MatchLCSSA->addIncoming(Search, BB3);
+ MatchPredLCSSA->addIncoming(MatchPred, BB3);
- // Ensure all Phis in the successors of BB3/BB5 have an incoming value from
+ // Ensure all Phis in the successors of BB4/BB5 have an incoming value from
// them.
- fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitSucc, BB3);
+ fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitSucc, BB4);
if (ExitSucc != ExitFail)
fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitFail, BB5);
diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
index 62d15b591c256..0ad9f1dc4c859 100644
--- a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
+++ b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
@@ -40,9 +40,10 @@ define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_sta
; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
-; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]]
-; CHECK: [[MATCH_CHECK_VEC]]:
-; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ]
+; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
+; CHECK: [[NEEDLE_CHECK_VEC]]:
+; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
@@ -52,19 +53,21 @@ define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_sta
; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[NEEDLE0_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], <vscale x 16 x i8> [[NEEDLE0_SPLAT]]
; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
-; CHECK-NEXT: [[MATCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
-; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]]
+; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
+; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 16 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
+; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK: [[MATCH_CHECK_VEC]]:
+; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[CALCULATE_MATCH]]:
; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[MATCH_VEC]], i1 true)
; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]]
-; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
-; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[SEARCH_CHECK_VEC]]:
; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
@@ -196,9 +199,10 @@ define ptr @find_first_of_i16(ptr %search_start, ptr %search_end, ptr %needle_st
; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 8 x i1> [[TMP0]], [[SEARCH_PRED]]
; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 1 [[PSEARCH]], <vscale x 8 x i1> [[SEARCH_MASKED]], <vscale x 8 x i16> zeroinitializer)
-; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]]
-; CHECK: [[MATCH_CHECK_VEC]]:
-; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ]
+; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
+; CHECK: [[NEEDLE_CHECK_VEC]]:
+; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 8 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 8 x i1> [[TMP0]], [[NEEDLE_PRED]]
@@ -208,19 +212,21 @@ define ptr @find_first_of_i16(ptr %search_start, ptr %search_end, ptr %needle_st
; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[NEEDLE0_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select <vscale x 8 x i1> [[NEEDLE_MASKED]], <vscale x 8 x i16> [[NEEDLE_LOAD_VEC]], <vscale x 8 x i16> [[NEEDLE0_SPLAT]]
; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[NEEDLE_SPLAT]], i64 0)
-; CHECK-NEXT: [[MATCH_PRED:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> [[SEARCH_LOAD_VEC]], <8 x i16> [[NEEDLE_VEC]], <vscale x 8 x i1> [[SEARCH_MASKED]])
-; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1(<vscale x 8 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]]
+; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> [[SEARCH_LOAD_VEC]], <8 x i16> [[NEEDLE_VEC]], <vscale x 8 x i1> [[SEARCH_MASKED]])
+; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 8 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
+; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i16, ptr [[PNEEDLE]], i64 8
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK: [[MATCH_CHECK_VEC]]:
+; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 8 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1(<vscale x 8 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[CALCULATE_MATCH]]:
; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 8 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[MATCH_VEC]], i1 true)
; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i16, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]]
-; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i16, ptr [[PNEEDLE]], i64 8
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
-; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[SEARCH_CHECK_VEC]]:
; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i16, ptr [[PSEARCH]], i64 8
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
@@ -352,9 +358,10 @@ define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr
; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
-; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]]
-; CHECK: [[MATCH_CHECK_VEC]]:
-; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ]
+; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
+; CHECK: [[NEEDLE_CHECK_VEC]]:
+; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
@@ -364,19 +371,21 @@ define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr
; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[NEEDLE0_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], <vscale x 16 x i8> [[NEEDLE0_SPLAT]]
; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
-; CHECK-NEXT: [[MATCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
-; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]]
+; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
+; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 16 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
+; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK: [[MATCH_CHECK_VEC]]:
+; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[CALCULATE_MATCH]]:
; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[MATCH_VEC]], i1 true)
; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
; CHECK-NEXT: br label %[[EXIT_SUCC:.*]]
-; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
-; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[SEARCH_CHECK_VEC]]:
; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
@@ -515,9 +524,10 @@ define ptr @ensure_not_found_successors_fixed(ptr %search_start, ptr %search_end
; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
-; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]]
-; CHECK: [[MATCH_CHECK_VEC]]:
-; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ]
+; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
+; CHECK: [[NEEDLE_CHECK_VEC]]:
+; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
@@ -527,19 +537,21 @@ define ptr @ensure_not_found_successors_fixed(ptr %search_start, ptr %search_end
; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[NEEDLE0_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], <vscale x 16 x i8> [[NEEDLE0_SPLAT]]
; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
-; CHECK-NEXT: [[MATCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
-; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]]
+; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
+; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 16 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
+; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK: [[MATCH_CHECK_VEC]]:
+; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[CALCULATE_MATCH]]:
; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[MATCH_VEC]], i1 true)
; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
; CHECK-NEXT: br label %[[FOUND_MATCH:.*]]
-; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
-; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[SEARCH_CHECK_VEC]]:
; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
@@ -666,9 +678,10 @@ define ptr @ensure_not_found_successors_fixed2(ptr %search_start, ptr %search_en
; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
-; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]]
-; CHECK: [[MATCH_CHECK_VEC]]:
-; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ]
+; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
+; CHECK: [[NEEDLE_CHECK_VEC]]:
+; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
@@ -678,19 +691,21 @@ define ptr @ensure_not_found_successors_fixed2(ptr %search_start, ptr %search_en
; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[NEEDLE0_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], <vscale x 16 x i8> [[NEEDLE0_SPLAT]]
; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
-; CHECK-NEXT: [[MATCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
-; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]]
+; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
+; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 16 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
+; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK: [[MATCH_CHECK_VEC]]:
+; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[CALCULATE_MATCH]]:
; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[MATCH_VEC]], i1 true)
; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
; CHECK-NEXT: br label %[[FOUND_MATCH:.*]]
-; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
-; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[SEARCH_CHECK_VEC]]:
; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
>From ed76893c03c4452c8aba0cb9434ba39382b125fe Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Tue, 3 Feb 2026 09:53:30 -0800
Subject: [PATCH 2/4] Avoid pointer wrap.
---
.../Vectorize/LoopIdiomVectorize.cpp | 53 +++--
.../LoopIdiom/AArch64/find-first-byte.ll | 185 +++++++-----------
2 files changed, 104 insertions(+), 134 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index d052eb4201dfe..d76df8f417e1c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -1069,9 +1069,10 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
!cast<LoadInst>(LoadNeedle)->isSimple())
return false;
- // Check we are loading valid characters.
+ // Check we are loading valid characters (currently limited to i8).
+ // Other types could be accepted but require more precise trip count handling.
Type *CharTy = LoadSearch->getType();
- if (!CharTy->isIntegerTy() || LoadNeedle->getType() != CharTy)
+ if (!CharTy->isIntegerTy(8) || LoadNeedle->getType() != CharTy)
return false;
// Pick the vectorisation factor based on CharTy, work out the cost of the
@@ -1086,6 +1087,13 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
if (TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_SizeAndLatency) > 4)
return false;
+ // We exclude loops with trip counts > minimum page size via runtime checks.
+ // Since MinPageSize - 1 + VF is an upper bound on the maximum value the
+ // loops' induction variables may take, the IVs can't wrap as long as this
+ // expression doesn't wrap in 64-bit arithmetic.
+ if (uint64_t(*TTI->getMinPageSize() - 1) + VF < VF)
+ return false;
+
// The loads come from two PHIs, each with two incoming values.
PHINode *PSearch = dyn_cast<PHINode>(Search);
PHINode *PNeedle = dyn_cast<PHINode>(Needle);
@@ -1253,10 +1261,16 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
Builder.CreatePtrToInt(SearchStart, I64Ty, "search_start_int");
Value *ISearchEnd =
Builder.CreatePtrToInt(SearchEnd, I64Ty, "search_end_int");
+ Value *SearchIdxInit = Constant::getNullValue(I64Ty);
+ Value *SearchTripCount =
+ Builder.CreateSub(ISearchEnd, ISearchStart, "search_trip_count");
Value *INeedleStart =
Builder.CreatePtrToInt(NeedleStart, I64Ty, "needle_start_int");
Value *INeedleEnd =
Builder.CreatePtrToInt(NeedleEnd, I64Ty, "needle_end_int");
+ Value *NeedleIdxInit = Constant::getNullValue(I64Ty);
+ Value *NeedleTripCount =
+ Builder.CreateSub(INeedleEnd, INeedleStart, "needle_trip_count");
Value *PredVF =
Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
{ConstantInt::get(I64Ty, 0), ConstVF});
@@ -1286,12 +1300,12 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
// (1) Load the search array and branch to the inner loop.
Builder.SetInsertPoint(BB1);
- PHINode *Search = Builder.CreatePHI(PtrTy, 2, "psearch");
+ PHINode *SearchIdx = Builder.CreatePHI(I64Ty, 2, "search_idx");
Value *PredSearch = Builder.CreateIntrinsic(
Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
- {Builder.CreatePtrToInt(Search, I64Ty), ISearchEnd}, nullptr,
- "search_pred");
+ {SearchIdx, SearchTripCount}, nullptr, "search_pred");
PredSearch = Builder.CreateAnd(PredVF, PredSearch, "search_masked");
+ Value *Search = Builder.CreateGEP(CharTy, SearchStart, SearchIdx, "psearch");
Value *LoadSearch = Builder.CreateMaskedLoad(
CharVTy, Search, Align(1), PredSearch, Passthru, "search_load_vec");
Value *MatchInit = Constant::getNullValue(PredVTy);
@@ -1300,15 +1314,15 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
// (2) Inner loop.
Builder.SetInsertPoint(BB2);
- PHINode *Needle = Builder.CreatePHI(PtrTy, 2, "pneedle");
+ PHINode *NeedleIdx = Builder.CreatePHI(I64Ty, 2, "needle_idx");
PHINode *Match = Builder.CreatePHI(PredVTy, 2, "pmatch");
// (2.a) Load the needle array.
Value *PredNeedle = Builder.CreateIntrinsic(
Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
- {Builder.CreatePtrToInt(Needle, I64Ty), INeedleEnd}, nullptr,
- "needle_pred");
+ {NeedleIdx, NeedleTripCount}, nullptr, "needle_pred");
PredNeedle = Builder.CreateAnd(PredVF, PredNeedle, "needle_masked");
+ Value *Needle = Builder.CreateGEP(CharTy, NeedleStart, NeedleIdx, "pneedle");
Value *LoadNeedle = Builder.CreateMaskedLoad(
CharVTy, Needle, Align(1), PredNeedle, Passthru, "needle_load_vec");
@@ -1327,9 +1341,10 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
Intrinsic::experimental_vector_match, {CharVTy, LoadNeedle->getType()},
{LoadSearch, LoadNeedle, PredSearch}, nullptr, "match_segment");
Value *MatchAcc = Builder.CreateOr(Match, MatchSeg, "match_accumulator");
- Value *NextNeedle =
- Builder.CreateGEP(CharTy, Needle, ConstVF, "needle_next_vec");
- Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedle, NeedleEnd), BB2, BB3);
+ Value *NextNeedleIdx =
+ Builder.CreateAdd(NeedleIdx, ConstVF, "needle_idx_next");
+ Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedleIdx, NeedleTripCount),
+ BB2, BB3);
DTU.applyUpdates(
{{DominatorTree::Insert, BB2, BB2}, {DominatorTree::Insert, BB2, BB3}});
@@ -1357,18 +1372,18 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
// (5) Check if we've reached the end of the search array.
Builder.SetInsertPoint(BB5);
- Value *NextSearch =
- Builder.CreateGEP(CharTy, Search, ConstVF, "search_next_vec");
- Builder.CreateCondBr(Builder.CreateICmpULT(NextSearch, SearchEnd), BB1,
- ExitFail);
+ Value *NextSearchIdx =
+ Builder.CreateAdd(SearchIdx, ConstVF, "search_idx_next");
+ Builder.CreateCondBr(Builder.CreateICmpULT(NextSearchIdx, SearchTripCount),
+ BB1, ExitFail);
DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1},
{DominatorTree::Insert, BB5, ExitFail}});
// Set up the PHI nodes.
- Search->addIncoming(SearchStart, BB0);
- Search->addIncoming(NextSearch, BB5);
- Needle->addIncoming(NeedleStart, BB1);
- Needle->addIncoming(NextNeedle, BB2);
+ SearchIdx->addIncoming(SearchIdxInit, BB0);
+ SearchIdx->addIncoming(NextSearchIdx, BB5);
+ NeedleIdx->addIncoming(NeedleIdxInit, BB1);
+ NeedleIdx->addIncoming(NextNeedleIdx, BB2);
Match->addIncoming(MatchInit, BB1);
Match->addIncoming(MatchAcc, BB2);
// These are needed to retain LCSSA form.
diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
index 0ad9f1dc4c859..32dc1f5cbb4c4 100644
--- a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
+++ b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
@@ -23,8 +23,10 @@ define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_sta
; CHECK: [[MEM_CHECK]]:
; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
+; CHECK-NEXT: [[SEARCH_TRIP_COUNT:%.*]] = sub i64 [[SEARCH_END_INT]], [[SEARCH_START_INT]]
; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
+; CHECK-NEXT: [[NEEDLE_TRIP_COUNT:%.*]] = sub i64 [[NEEDLE_END_INT]], [[NEEDLE_START_INT]]
; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
@@ -35,18 +37,18 @@ define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_sta
; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0:![0-9]+]]
; CHECK: [[FIND_FIRST_VEC_HEADER]]:
-; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
-; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
+; CHECK-NEXT: [[SEARCH_IDX:%.*]] = phi i64 [ 0, %[[MEM_CHECK]] ], [ [[SEARCH_IDX_NEXT:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[SEARCH_IDX]], i64 [[SEARCH_TRIP_COUNT]])
; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
+; CHECK-NEXT: [[PSEARCH:%.*]] = getelementptr i8, ptr [[SEARCH_START]], i64 [[SEARCH_IDX]]
; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[NEEDLE_IDX:%.*]] = phi i64 [ 0, %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_IDX_NEXT:%.*]], %[[NEEDLE_CHECK_VEC]] ]
; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
-; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
+; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[NEEDLE_IDX]], i64 [[NEEDLE_TRIP_COUNT]])
; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
+; CHECK-NEXT: [[PNEEDLE:%.*]] = getelementptr i8, ptr [[NEEDLE_START]], i64 [[NEEDLE_IDX]]
; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PNEEDLE]], <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0
; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0
@@ -55,13 +57,13 @@ define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_sta
; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 16 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
-; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK-NEXT: [[NEEDLE_IDX_NEXT]] = add i64 [[NEEDLE_IDX]], 16
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[NEEDLE_IDX_NEXT]], [[NEEDLE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
; CHECK: [[MATCH_CHECK_VEC]]:
; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP2]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[CALCULATE_MATCH]]:
; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
@@ -69,9 +71,9 @@ define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_sta
; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]]
; CHECK: [[SEARCH_CHECK_VEC]]:
-; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
-; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]]
+; CHECK-NEXT: [[SEARCH_IDX_NEXT]] = add i64 [[SEARCH_IDX]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[SEARCH_IDX_NEXT]], [[SEARCH_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]]
; CHECK: [[SCALAR_PREHEADER]]:
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
@@ -168,7 +170,8 @@ exit:
}
; Equivalent to @find_first_of_i8 but with i16.
-; This is accepted and generates a similar loop.
+; This is currently not accepted, but could be with more precise trip count
+; handling.
define ptr @find_first_of_i16(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 {
; CHECK-LABEL: define ptr @find_first_of_i16(
; CHECK-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] {
@@ -178,63 +181,9 @@ define ptr @find_first_of_i16(ptr %search_start, ptr %search_end, ptr %needle_st
; CHECK-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]]
; CHECK-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]]
; CHECK: [[HEADER_PREHEADER]]:
-; CHECK-NEXT: br label %[[MEM_CHECK:.*]]
-; CHECK: [[MEM_CHECK]]:
-; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
-; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
-; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
-; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
-; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
-; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
-; CHECK-NEXT: [[NEEDLE_START_PAGE:%.*]] = lshr i64 [[NEEDLE_START_INT]], 12
-; CHECK-NEXT: [[NEEDLE_END_PAGE:%.*]] = lshr i64 [[NEEDLE_END_INT]], 12
-; CHECK-NEXT: [[SEARCH_PAGE_CMP:%.*]] = icmp ne i64 [[SEARCH_START_PAGE]], [[SEARCH_END_PAGE]]
-; CHECK-NEXT: [[NEEDLE_PAGE_CMP:%.*]] = icmp ne i64 [[NEEDLE_START_PAGE]], [[NEEDLE_END_PAGE]]
-; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
-; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]]
-; CHECK: [[FIND_FIRST_VEC_HEADER]]:
-; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
-; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
-; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 8 x i1> [[TMP0]], [[SEARCH_PRED]]
-; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 1 [[PSEARCH]], <vscale x 8 x i1> [[SEARCH_MASKED]], <vscale x 8 x i16> zeroinitializer)
-; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
-; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 8 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
-; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
-; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 8 x i1> [[TMP0]], [[NEEDLE_PRED]]
-; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 1 [[PNEEDLE]], <vscale x 8 x i1> [[NEEDLE_MASKED]], <vscale x 8 x i16> zeroinitializer)
-; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 8 x i16> [[NEEDLE_LOAD_VEC]], i64 0
-; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[NEEDLE0]], i64 0
-; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[NEEDLE0_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select <vscale x 8 x i1> [[NEEDLE_MASKED]], <vscale x 8 x i16> [[NEEDLE_LOAD_VEC]], <vscale x 8 x i16> [[NEEDLE0_SPLAT]]
-; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[NEEDLE_SPLAT]], i64 0)
-; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> [[SEARCH_LOAD_VEC]], <8 x i16> [[NEEDLE_VEC]], <vscale x 8 x i1> [[SEARCH_MASKED]])
-; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 8 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
-; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i16, ptr [[PNEEDLE]], i64 8
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
-; CHECK: [[MATCH_CHECK_VEC]]:
-; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 8 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1(<vscale x 8 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
-; CHECK: [[CALCULATE_MATCH]]:
-; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
-; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 8 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
-; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[MATCH_VEC]], i1 true)
-; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i16, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
-; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]]
-; CHECK: [[SEARCH_CHECK_VEC]]:
-; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i16, ptr [[PSEARCH]], i64 8
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
-; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]]
-; CHECK: [[SCALAR_PREHEADER]]:
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
-; CHECK-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[SCALAR_PREHEADER]] ]
+; CHECK-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[HEADER_PREHEADER]] ]
; CHECK-NEXT: [[SEARCH_LOAD:%.*]] = load i16, ptr [[SEARCH_PTR]], align 1
; CHECK-NEXT: br label %[[MATCH_CHECK:.*]]
; CHECK: [[NEEDLE_CHECK:.*]]:
@@ -245,13 +194,13 @@ define ptr @find_first_of_i16(ptr %search_start, ptr %search_end, ptr %needle_st
; CHECK-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ]
; CHECK-NEXT: [[NEEDLE_LOAD:%.*]] = load i16, ptr [[NEEDLE_PTR]], align 1
; CHECK-NEXT: [[MATCH_CMP:%.*]] = icmp eq i16 [[SEARCH_LOAD]], [[NEEDLE_LOAD]]
-; CHECK-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT]], label %[[NEEDLE_CHECK]]
+; CHECK-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[NEEDLE_CHECK]]
; CHECK: [[SEARCH_CHECK]]:
; CHECK-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i16, ptr [[SEARCH_PTR]], i64 1
; CHECK-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]]
-; CHECK-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1]], label %[[HEADER]]
+; CHECK-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1:.*]], label %[[HEADER]]
; CHECK: [[EXIT_LOOPEXIT]]:
-; CHECK-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ], [ [[MATCH_RES]], %[[CALCULATE_MATCH]] ]
+; CHECK-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ]
; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[EXIT_LOOPEXIT1]]:
; CHECK-NEXT: br label %[[EXIT]]
@@ -341,8 +290,10 @@ define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr
; CHECK: [[MEM_CHECK]]:
; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
+; CHECK-NEXT: [[SEARCH_TRIP_COUNT:%.*]] = sub i64 [[SEARCH_END_INT]], [[SEARCH_START_INT]]
; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
+; CHECK-NEXT: [[NEEDLE_TRIP_COUNT:%.*]] = sub i64 [[NEEDLE_END_INT]], [[NEEDLE_START_INT]]
; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
@@ -353,18 +304,18 @@ define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr
; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]]
; CHECK: [[FIND_FIRST_VEC_HEADER]]:
-; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
-; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
+; CHECK-NEXT: [[SEARCH_IDX:%.*]] = phi i64 [ 0, %[[MEM_CHECK]] ], [ [[SEARCH_IDX_NEXT:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[SEARCH_IDX]], i64 [[SEARCH_TRIP_COUNT]])
; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
+; CHECK-NEXT: [[PSEARCH:%.*]] = getelementptr i8, ptr [[SEARCH_START]], i64 [[SEARCH_IDX]]
; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[NEEDLE_IDX:%.*]] = phi i64 [ 0, %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_IDX_NEXT:%.*]], %[[NEEDLE_CHECK_VEC]] ]
; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
-; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
+; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[NEEDLE_IDX]], i64 [[NEEDLE_TRIP_COUNT]])
; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
+; CHECK-NEXT: [[PNEEDLE:%.*]] = getelementptr i8, ptr [[NEEDLE_START]], i64 [[NEEDLE_IDX]]
; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PNEEDLE]], <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0
; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0
@@ -373,13 +324,13 @@ define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr
; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 16 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
-; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK-NEXT: [[NEEDLE_IDX_NEXT]] = add i64 [[NEEDLE_IDX]], 16
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[NEEDLE_IDX_NEXT]], [[NEEDLE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
; CHECK: [[MATCH_CHECK_VEC]]:
; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP2]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[CALCULATE_MATCH]]:
; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
@@ -387,9 +338,9 @@ define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr
; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
; CHECK-NEXT: br label %[[EXIT_SUCC:.*]]
; CHECK: [[SEARCH_CHECK_VEC]]:
-; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
-; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_FAIL_LOOPEXIT:.*]]
+; CHECK-NEXT: [[SEARCH_IDX_NEXT]] = add i64 [[SEARCH_IDX]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[SEARCH_IDX_NEXT]], [[SEARCH_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_FAIL_LOOPEXIT:.*]]
; CHECK: [[SCALAR_PREHEADER]]:
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
@@ -507,8 +458,10 @@ define ptr @ensure_not_found_successors_fixed(ptr %search_start, ptr %search_end
; CHECK: [[MEM_CHECK]]:
; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
+; CHECK-NEXT: [[SEARCH_TRIP_COUNT:%.*]] = sub i64 [[SEARCH_END_INT]], [[SEARCH_START_INT]]
; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
+; CHECK-NEXT: [[NEEDLE_TRIP_COUNT:%.*]] = sub i64 [[NEEDLE_END_INT]], [[NEEDLE_START_INT]]
; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
@@ -519,18 +472,18 @@ define ptr @ensure_not_found_successors_fixed(ptr %search_start, ptr %search_end
; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]]
; CHECK: [[FIND_FIRST_VEC_HEADER]]:
-; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
-; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
+; CHECK-NEXT: [[SEARCH_IDX:%.*]] = phi i64 [ 0, %[[MEM_CHECK]] ], [ [[SEARCH_IDX_NEXT:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[SEARCH_IDX]], i64 [[SEARCH_TRIP_COUNT]])
; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
+; CHECK-NEXT: [[PSEARCH:%.*]] = getelementptr i8, ptr [[SEARCH_START]], i64 [[SEARCH_IDX]]
; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[NEEDLE_IDX:%.*]] = phi i64 [ 0, %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_IDX_NEXT:%.*]], %[[NEEDLE_CHECK_VEC]] ]
; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
-; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
+; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[NEEDLE_IDX]], i64 [[NEEDLE_TRIP_COUNT]])
; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
+; CHECK-NEXT: [[PNEEDLE:%.*]] = getelementptr i8, ptr [[NEEDLE_START]], i64 [[NEEDLE_IDX]]
; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PNEEDLE]], <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0
; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0
@@ -539,13 +492,13 @@ define ptr @ensure_not_found_successors_fixed(ptr %search_start, ptr %search_end
; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 16 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
-; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK-NEXT: [[NEEDLE_IDX_NEXT]] = add i64 [[NEEDLE_IDX]], 16
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[NEEDLE_IDX_NEXT]], [[NEEDLE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
; CHECK: [[MATCH_CHECK_VEC]]:
; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP2]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[CALCULATE_MATCH]]:
; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
@@ -553,9 +506,9 @@ define ptr @ensure_not_found_successors_fixed(ptr %search_start, ptr %search_end
; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
; CHECK-NEXT: br label %[[FOUND_MATCH:.*]]
; CHECK: [[SEARCH_CHECK_VEC]]:
-; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
-; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[NOT_FOUND:.*]]
+; CHECK-NEXT: [[SEARCH_IDX_NEXT]] = add i64 [[SEARCH_IDX]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[SEARCH_IDX_NEXT]], [[SEARCH_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FIND_FIRST_VEC_HEADER]], label %[[NOT_FOUND:.*]]
; CHECK: [[SCALAR_PREHEADER]]:
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
@@ -661,8 +614,10 @@ define ptr @ensure_not_found_successors_fixed2(ptr %search_start, ptr %search_en
; CHECK: [[MEM_CHECK]]:
; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
+; CHECK-NEXT: [[SEARCH_TRIP_COUNT:%.*]] = sub i64 [[SEARCH_END_INT]], [[SEARCH_START_INT]]
; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
+; CHECK-NEXT: [[NEEDLE_TRIP_COUNT:%.*]] = sub i64 [[NEEDLE_END_INT]], [[NEEDLE_START_INT]]
; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
@@ -673,18 +628,18 @@ define ptr @ensure_not_found_successors_fixed2(ptr %search_start, ptr %search_en
; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]]
; CHECK: [[FIND_FIRST_VEC_HEADER]]:
-; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
-; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
+; CHECK-NEXT: [[SEARCH_IDX:%.*]] = phi i64 [ 0, %[[MEM_CHECK]] ], [ [[SEARCH_IDX_NEXT:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[SEARCH_IDX]], i64 [[SEARCH_TRIP_COUNT]])
; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
+; CHECK-NEXT: [[PSEARCH:%.*]] = getelementptr i8, ptr [[SEARCH_START]], i64 [[SEARCH_IDX]]
; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[NEEDLE_IDX:%.*]] = phi i64 [ 0, %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_IDX_NEXT:%.*]], %[[NEEDLE_CHECK_VEC]] ]
; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
-; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
+; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[NEEDLE_IDX]], i64 [[NEEDLE_TRIP_COUNT]])
; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
+; CHECK-NEXT: [[PNEEDLE:%.*]] = getelementptr i8, ptr [[NEEDLE_START]], i64 [[NEEDLE_IDX]]
; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PNEEDLE]], <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0
; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0
@@ -693,13 +648,13 @@ define ptr @ensure_not_found_successors_fixed2(ptr %search_start, ptr %search_en
; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 16 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
-; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK-NEXT: [[NEEDLE_IDX_NEXT]] = add i64 [[NEEDLE_IDX]], 16
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[NEEDLE_IDX_NEXT]], [[NEEDLE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
; CHECK: [[MATCH_CHECK_VEC]]:
; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP2]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[CALCULATE_MATCH]]:
; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
@@ -707,9 +662,9 @@ define ptr @ensure_not_found_successors_fixed2(ptr %search_start, ptr %search_en
; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
; CHECK-NEXT: br label %[[FOUND_MATCH:.*]]
; CHECK: [[SEARCH_CHECK_VEC]]:
-; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
-; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[NOT_FOUND:.*]]
+; CHECK-NEXT: [[SEARCH_IDX_NEXT]] = add i64 [[SEARCH_IDX]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[SEARCH_IDX_NEXT]], [[SEARCH_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FIND_FIRST_VEC_HEADER]], label %[[NOT_FOUND:.*]]
; CHECK: [[SCALAR_PREHEADER]]:
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
>From 3b82c41e710fad849e2953aa061d72549915e744 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Mon, 9 Feb 2026 08:07:18 -0800
Subject: [PATCH 3/4] Revert "Avoid pointer wrap."
This reverts commit ed76893c03c4452c8aba0cb9434ba39382b125fe.
---
.../Vectorize/LoopIdiomVectorize.cpp | 53 ++---
.../LoopIdiom/AArch64/find-first-byte.ll | 185 +++++++++++-------
2 files changed, 134 insertions(+), 104 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index d76df8f417e1c..d052eb4201dfe 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -1069,10 +1069,9 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
!cast<LoadInst>(LoadNeedle)->isSimple())
return false;
- // Check we are loading valid characters (currently limited to i8).
- // Other types could be accepted but require more precise trip count handling.
+ // Check we are loading valid characters.
Type *CharTy = LoadSearch->getType();
- if (!CharTy->isIntegerTy(8) || LoadNeedle->getType() != CharTy)
+ if (!CharTy->isIntegerTy() || LoadNeedle->getType() != CharTy)
return false;
// Pick the vectorisation factor based on CharTy, work out the cost of the
@@ -1087,13 +1086,6 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
if (TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_SizeAndLatency) > 4)
return false;
- // We exclude loops with trip counts > minimum page size via runtime checks.
- // As MinPageSize - 1 + VF is an upper bound on the maximum value the loops'
- // induction variables may take, so long as this expression doesn't wrap with
- // 64-bit arithmetic, the IVs can't wrap either.
- if (uint64_t(*TTI->getMinPageSize() - 1) + VF < VF)
- return false;
-
// The loads come from two PHIs, each with two incoming values.
PHINode *PSearch = dyn_cast<PHINode>(Search);
PHINode *PNeedle = dyn_cast<PHINode>(Needle);
@@ -1261,16 +1253,10 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
Builder.CreatePtrToInt(SearchStart, I64Ty, "search_start_int");
Value *ISearchEnd =
Builder.CreatePtrToInt(SearchEnd, I64Ty, "search_end_int");
- Value *SearchIdxInit = Constant::getNullValue(I64Ty);
- Value *SearchTripCount =
- Builder.CreateSub(ISearchEnd, ISearchStart, "search_trip_count");
Value *INeedleStart =
Builder.CreatePtrToInt(NeedleStart, I64Ty, "needle_start_int");
Value *INeedleEnd =
Builder.CreatePtrToInt(NeedleEnd, I64Ty, "needle_end_int");
- Value *NeedleIdxInit = Constant::getNullValue(I64Ty);
- Value *NeedleTripCount =
- Builder.CreateSub(INeedleEnd, INeedleStart, "needle_trip_count");
Value *PredVF =
Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
{ConstantInt::get(I64Ty, 0), ConstVF});
@@ -1300,12 +1286,12 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
// (1) Load the search array and branch to the inner loop.
Builder.SetInsertPoint(BB1);
- PHINode *SearchIdx = Builder.CreatePHI(I64Ty, 2, "search_idx");
+ PHINode *Search = Builder.CreatePHI(PtrTy, 2, "psearch");
Value *PredSearch = Builder.CreateIntrinsic(
Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
- {SearchIdx, SearchTripCount}, nullptr, "search_pred");
+ {Builder.CreatePtrToInt(Search, I64Ty), ISearchEnd}, nullptr,
+ "search_pred");
PredSearch = Builder.CreateAnd(PredVF, PredSearch, "search_masked");
- Value *Search = Builder.CreateGEP(CharTy, SearchStart, SearchIdx, "psearch");
Value *LoadSearch = Builder.CreateMaskedLoad(
CharVTy, Search, Align(1), PredSearch, Passthru, "search_load_vec");
Value *MatchInit = Constant::getNullValue(PredVTy);
@@ -1314,15 +1300,15 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
// (2) Inner loop.
Builder.SetInsertPoint(BB2);
- PHINode *NeedleIdx = Builder.CreatePHI(I64Ty, 2, "needle_idx");
+ PHINode *Needle = Builder.CreatePHI(PtrTy, 2, "pneedle");
PHINode *Match = Builder.CreatePHI(PredVTy, 2, "pmatch");
// (2.a) Load the needle array.
Value *PredNeedle = Builder.CreateIntrinsic(
Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
- {NeedleIdx, NeedleTripCount}, nullptr, "needle_pred");
+ {Builder.CreatePtrToInt(Needle, I64Ty), INeedleEnd}, nullptr,
+ "needle_pred");
PredNeedle = Builder.CreateAnd(PredVF, PredNeedle, "needle_masked");
- Value *Needle = Builder.CreateGEP(CharTy, NeedleStart, NeedleIdx, "pneedle");
Value *LoadNeedle = Builder.CreateMaskedLoad(
CharVTy, Needle, Align(1), PredNeedle, Passthru, "needle_load_vec");
@@ -1341,10 +1327,9 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
Intrinsic::experimental_vector_match, {CharVTy, LoadNeedle->getType()},
{LoadSearch, LoadNeedle, PredSearch}, nullptr, "match_segment");
Value *MatchAcc = Builder.CreateOr(Match, MatchSeg, "match_accumulator");
- Value *NextNeedleIdx =
- Builder.CreateAdd(NeedleIdx, ConstVF, "needle_idx_next");
- Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedleIdx, NeedleTripCount),
- BB2, BB3);
+ Value *NextNeedle =
+ Builder.CreateGEP(CharTy, Needle, ConstVF, "needle_next_vec");
+ Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedle, NeedleEnd), BB2, BB3);
DTU.applyUpdates(
{{DominatorTree::Insert, BB2, BB2}, {DominatorTree::Insert, BB2, BB3}});
@@ -1372,18 +1357,18 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
// (5) Check if we've reached the end of the search array.
Builder.SetInsertPoint(BB5);
- Value *NextSearchIdx =
- Builder.CreateAdd(SearchIdx, ConstVF, "search_idx_next");
- Builder.CreateCondBr(Builder.CreateICmpULT(NextSearchIdx, SearchTripCount),
- BB1, ExitFail);
+ Value *NextSearch =
+ Builder.CreateGEP(CharTy, Search, ConstVF, "search_next_vec");
+ Builder.CreateCondBr(Builder.CreateICmpULT(NextSearch, SearchEnd), BB1,
+ ExitFail);
DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1},
{DominatorTree::Insert, BB5, ExitFail}});
// Set up the PHI nodes.
- SearchIdx->addIncoming(SearchIdxInit, BB0);
- SearchIdx->addIncoming(NextSearchIdx, BB5);
- NeedleIdx->addIncoming(NeedleIdxInit, BB1);
- NeedleIdx->addIncoming(NextNeedleIdx, BB2);
+ Search->addIncoming(SearchStart, BB0);
+ Search->addIncoming(NextSearch, BB5);
+ Needle->addIncoming(NeedleStart, BB1);
+ Needle->addIncoming(NextNeedle, BB2);
Match->addIncoming(MatchInit, BB1);
Match->addIncoming(MatchAcc, BB2);
// These are needed to retain LCSSA form.
diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
index 32dc1f5cbb4c4..0ad9f1dc4c859 100644
--- a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
+++ b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll
@@ -23,10 +23,8 @@ define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_sta
; CHECK: [[MEM_CHECK]]:
; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
-; CHECK-NEXT: [[SEARCH_TRIP_COUNT:%.*]] = sub i64 [[SEARCH_END_INT]], [[SEARCH_START_INT]]
; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
-; CHECK-NEXT: [[NEEDLE_TRIP_COUNT:%.*]] = sub i64 [[NEEDLE_END_INT]], [[NEEDLE_START_INT]]
; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
@@ -37,18 +35,18 @@ define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_sta
; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0:![0-9]+]]
; CHECK: [[FIND_FIRST_VEC_HEADER]]:
-; CHECK-NEXT: [[SEARCH_IDX:%.*]] = phi i64 [ 0, %[[MEM_CHECK]] ], [ [[SEARCH_IDX_NEXT:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
-; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[SEARCH_IDX]], i64 [[SEARCH_TRIP_COUNT]])
+; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
+; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
-; CHECK-NEXT: [[PSEARCH:%.*]] = getelementptr i8, ptr [[SEARCH_START]], i64 [[SEARCH_IDX]]
; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[NEEDLE_IDX:%.*]] = phi i64 [ 0, %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_IDX_NEXT:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[NEEDLE_IDX]], i64 [[NEEDLE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
+; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
-; CHECK-NEXT: [[PNEEDLE:%.*]] = getelementptr i8, ptr [[NEEDLE_START]], i64 [[NEEDLE_IDX]]
; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PNEEDLE]], <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0
; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0
@@ -57,13 +55,13 @@ define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_sta
; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 16 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
-; CHECK-NEXT: [[NEEDLE_IDX_NEXT]] = add i64 [[NEEDLE_IDX]], 16
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[NEEDLE_IDX_NEXT]], [[NEEDLE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[TMP1]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
; CHECK: [[MATCH_CHECK_VEC]]:
; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP2]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[CALCULATE_MATCH]]:
; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
@@ -71,9 +69,9 @@ define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_sta
; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]]
; CHECK: [[SEARCH_CHECK_VEC]]:
-; CHECK-NEXT: [[SEARCH_IDX_NEXT]] = add i64 [[SEARCH_IDX]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[SEARCH_IDX_NEXT]], [[SEARCH_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]]
+; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]]
; CHECK: [[SCALAR_PREHEADER]]:
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
@@ -170,8 +168,7 @@ exit:
}
; Equivalent to @find_first_of_i8 but with i16.
-; This is currently not accepted, but could be with more precise trip count
-; handling.
+; This is accepted and generates a similar loop.
define ptr @find_first_of_i16(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 {
; CHECK-LABEL: define ptr @find_first_of_i16(
; CHECK-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] {
@@ -181,9 +178,63 @@ define ptr @find_first_of_i16(ptr %search_start, ptr %search_end, ptr %needle_st
; CHECK-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]]
; CHECK-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]]
; CHECK: [[HEADER_PREHEADER]]:
+; CHECK-NEXT: br label %[[MEM_CHECK:.*]]
+; CHECK: [[MEM_CHECK]]:
+; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
+; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
+; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
+; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
+; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
+; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
+; CHECK-NEXT: [[NEEDLE_START_PAGE:%.*]] = lshr i64 [[NEEDLE_START_INT]], 12
+; CHECK-NEXT: [[NEEDLE_END_PAGE:%.*]] = lshr i64 [[NEEDLE_END_INT]], 12
+; CHECK-NEXT: [[SEARCH_PAGE_CMP:%.*]] = icmp ne i64 [[SEARCH_START_PAGE]], [[SEARCH_END_PAGE]]
+; CHECK-NEXT: [[NEEDLE_PAGE_CMP:%.*]] = icmp ne i64 [[NEEDLE_START_PAGE]], [[NEEDLE_END_PAGE]]
+; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
+; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]]
+; CHECK: [[FIND_FIRST_VEC_HEADER]]:
+; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
+; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
+; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 8 x i1> [[TMP0]], [[SEARCH_PRED]]
+; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 1 [[PSEARCH]], <vscale x 8 x i1> [[SEARCH_MASKED]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
+; CHECK: [[NEEDLE_CHECK_VEC]]:
+; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 8 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
+; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
+; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 8 x i1> [[TMP0]], [[NEEDLE_PRED]]
+; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 1 [[PNEEDLE]], <vscale x 8 x i1> [[NEEDLE_MASKED]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 8 x i16> [[NEEDLE_LOAD_VEC]], i64 0
+; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[NEEDLE0]], i64 0
+; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[NEEDLE0_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select <vscale x 8 x i1> [[NEEDLE_MASKED]], <vscale x 8 x i16> [[NEEDLE_LOAD_VEC]], <vscale x 8 x i16> [[NEEDLE0_SPLAT]]
+; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[NEEDLE_SPLAT]], i64 0)
+; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> [[SEARCH_LOAD_VEC]], <8 x i16> [[NEEDLE_VEC]], <vscale x 8 x i1> [[SEARCH_MASKED]])
+; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 8 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
+; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i16, ptr [[PNEEDLE]], i64 8
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK: [[MATCH_CHECK_VEC]]:
+; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 8 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1(<vscale x 8 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
+; CHECK: [[CALCULATE_MATCH]]:
+; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
+; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 8 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
+; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[MATCH_VEC]], i1 true)
+; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i16, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
+; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]]
+; CHECK: [[SEARCH_CHECK_VEC]]:
+; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i16, ptr [[PSEARCH]], i64 8
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]]
+; CHECK: [[SCALAR_PREHEADER]]:
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
-; CHECK-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[HEADER_PREHEADER]] ]
+; CHECK-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[SCALAR_PREHEADER]] ]
; CHECK-NEXT: [[SEARCH_LOAD:%.*]] = load i16, ptr [[SEARCH_PTR]], align 1
; CHECK-NEXT: br label %[[MATCH_CHECK:.*]]
; CHECK: [[NEEDLE_CHECK:.*]]:
@@ -194,13 +245,13 @@ define ptr @find_first_of_i16(ptr %search_start, ptr %search_end, ptr %needle_st
; CHECK-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ]
; CHECK-NEXT: [[NEEDLE_LOAD:%.*]] = load i16, ptr [[NEEDLE_PTR]], align 1
; CHECK-NEXT: [[MATCH_CMP:%.*]] = icmp eq i16 [[SEARCH_LOAD]], [[NEEDLE_LOAD]]
-; CHECK-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[NEEDLE_CHECK]]
+; CHECK-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT]], label %[[NEEDLE_CHECK]]
; CHECK: [[SEARCH_CHECK]]:
; CHECK-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i16, ptr [[SEARCH_PTR]], i64 1
; CHECK-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]]
-; CHECK-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1:.*]], label %[[HEADER]]
+; CHECK-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1]], label %[[HEADER]]
; CHECK: [[EXIT_LOOPEXIT]]:
-; CHECK-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ]
+; CHECK-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ], [ [[MATCH_RES]], %[[CALCULATE_MATCH]] ]
; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[EXIT_LOOPEXIT1]]:
; CHECK-NEXT: br label %[[EXIT]]
@@ -290,10 +341,8 @@ define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr
; CHECK: [[MEM_CHECK]]:
; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
-; CHECK-NEXT: [[SEARCH_TRIP_COUNT:%.*]] = sub i64 [[SEARCH_END_INT]], [[SEARCH_START_INT]]
; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
-; CHECK-NEXT: [[NEEDLE_TRIP_COUNT:%.*]] = sub i64 [[NEEDLE_END_INT]], [[NEEDLE_START_INT]]
; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
@@ -304,18 +353,18 @@ define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr
; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]]
; CHECK: [[FIND_FIRST_VEC_HEADER]]:
-; CHECK-NEXT: [[SEARCH_IDX:%.*]] = phi i64 [ 0, %[[MEM_CHECK]] ], [ [[SEARCH_IDX_NEXT:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
-; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[SEARCH_IDX]], i64 [[SEARCH_TRIP_COUNT]])
+; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
+; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
-; CHECK-NEXT: [[PSEARCH:%.*]] = getelementptr i8, ptr [[SEARCH_START]], i64 [[SEARCH_IDX]]
; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[NEEDLE_IDX:%.*]] = phi i64 [ 0, %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_IDX_NEXT:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[NEEDLE_IDX]], i64 [[NEEDLE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
+; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
-; CHECK-NEXT: [[PNEEDLE:%.*]] = getelementptr i8, ptr [[NEEDLE_START]], i64 [[NEEDLE_IDX]]
; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PNEEDLE]], <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0
; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0
@@ -324,13 +373,13 @@ define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr
; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 16 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
-; CHECK-NEXT: [[NEEDLE_IDX_NEXT]] = add i64 [[NEEDLE_IDX]], 16
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[NEEDLE_IDX_NEXT]], [[NEEDLE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[TMP1]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
; CHECK: [[MATCH_CHECK_VEC]]:
; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP2]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[CALCULATE_MATCH]]:
; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
@@ -338,9 +387,9 @@ define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr
; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
; CHECK-NEXT: br label %[[EXIT_SUCC:.*]]
; CHECK: [[SEARCH_CHECK_VEC]]:
-; CHECK-NEXT: [[SEARCH_IDX_NEXT]] = add i64 [[SEARCH_IDX]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[SEARCH_IDX_NEXT]], [[SEARCH_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_FAIL_LOOPEXIT:.*]]
+; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_FAIL_LOOPEXIT:.*]]
; CHECK: [[SCALAR_PREHEADER]]:
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
@@ -458,10 +507,8 @@ define ptr @ensure_not_found_successors_fixed(ptr %search_start, ptr %search_end
; CHECK: [[MEM_CHECK]]:
; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
-; CHECK-NEXT: [[SEARCH_TRIP_COUNT:%.*]] = sub i64 [[SEARCH_END_INT]], [[SEARCH_START_INT]]
; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
-; CHECK-NEXT: [[NEEDLE_TRIP_COUNT:%.*]] = sub i64 [[NEEDLE_END_INT]], [[NEEDLE_START_INT]]
; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
@@ -472,18 +519,18 @@ define ptr @ensure_not_found_successors_fixed(ptr %search_start, ptr %search_end
; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]]
; CHECK: [[FIND_FIRST_VEC_HEADER]]:
-; CHECK-NEXT: [[SEARCH_IDX:%.*]] = phi i64 [ 0, %[[MEM_CHECK]] ], [ [[SEARCH_IDX_NEXT:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
-; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[SEARCH_IDX]], i64 [[SEARCH_TRIP_COUNT]])
+; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
+; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
-; CHECK-NEXT: [[PSEARCH:%.*]] = getelementptr i8, ptr [[SEARCH_START]], i64 [[SEARCH_IDX]]
; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[NEEDLE_IDX:%.*]] = phi i64 [ 0, %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_IDX_NEXT:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[NEEDLE_IDX]], i64 [[NEEDLE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
+; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
-; CHECK-NEXT: [[PNEEDLE:%.*]] = getelementptr i8, ptr [[NEEDLE_START]], i64 [[NEEDLE_IDX]]
; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PNEEDLE]], <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0
; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0
@@ -492,13 +539,13 @@ define ptr @ensure_not_found_successors_fixed(ptr %search_start, ptr %search_end
; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 16 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
-; CHECK-NEXT: [[NEEDLE_IDX_NEXT]] = add i64 [[NEEDLE_IDX]], 16
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[NEEDLE_IDX_NEXT]], [[NEEDLE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[TMP1]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
; CHECK: [[MATCH_CHECK_VEC]]:
; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP2]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[CALCULATE_MATCH]]:
; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
@@ -506,9 +553,9 @@ define ptr @ensure_not_found_successors_fixed(ptr %search_start, ptr %search_end
; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
; CHECK-NEXT: br label %[[FOUND_MATCH:.*]]
; CHECK: [[SEARCH_CHECK_VEC]]:
-; CHECK-NEXT: [[SEARCH_IDX_NEXT]] = add i64 [[SEARCH_IDX]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[SEARCH_IDX_NEXT]], [[SEARCH_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FIND_FIRST_VEC_HEADER]], label %[[NOT_FOUND:.*]]
+; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[NOT_FOUND:.*]]
; CHECK: [[SCALAR_PREHEADER]]:
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
@@ -614,10 +661,8 @@ define ptr @ensure_not_found_successors_fixed2(ptr %search_start, ptr %search_en
; CHECK: [[MEM_CHECK]]:
; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64
; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64
-; CHECK-NEXT: [[SEARCH_TRIP_COUNT:%.*]] = sub i64 [[SEARCH_END_INT]], [[SEARCH_START_INT]]
; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64
; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64
-; CHECK-NEXT: [[NEEDLE_TRIP_COUNT:%.*]] = sub i64 [[NEEDLE_END_INT]], [[NEEDLE_START_INT]]
; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12
; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12
@@ -628,18 +673,18 @@ define ptr @ensure_not_found_successors_fixed2(ptr %search_start, ptr %search_en
; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]]
; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]]
; CHECK: [[FIND_FIRST_VEC_HEADER]]:
-; CHECK-NEXT: [[SEARCH_IDX:%.*]] = phi i64 [ 0, %[[MEM_CHECK]] ], [ [[SEARCH_IDX_NEXT:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
-; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[SEARCH_IDX]], i64 [[SEARCH_TRIP_COUNT]])
+; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64
+; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]])
; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]]
-; CHECK-NEXT: [[PSEARCH:%.*]] = getelementptr i8, ptr [[SEARCH_START]], i64 [[SEARCH_IDX]]
; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: br label %[[NEEDLE_CHECK_VEC:.*]]
; CHECK: [[NEEDLE_CHECK_VEC]]:
-; CHECK-NEXT: [[NEEDLE_IDX:%.*]] = phi i64 [ 0, %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_IDX_NEXT:%.*]], %[[NEEDLE_CHECK_VEC]] ]
+; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC]] ]
; CHECK-NEXT: [[PMATCH:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[FIND_FIRST_VEC_HEADER]] ], [ [[MATCH_ACCUMULATOR:%.*]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[NEEDLE_IDX]], i64 [[NEEDLE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64
+; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]])
; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]]
-; CHECK-NEXT: [[PNEEDLE:%.*]] = getelementptr i8, ptr [[NEEDLE_START]], i64 [[NEEDLE_IDX]]
; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PNEEDLE]], <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0
; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0
@@ -648,13 +693,13 @@ define ptr @ensure_not_found_successors_fixed2(ptr %search_start, ptr %search_en
; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0)
; CHECK-NEXT: [[MATCH_SEGMENT:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]])
; CHECK-NEXT: [[MATCH_ACCUMULATOR]] = or <vscale x 16 x i1> [[PMATCH]], [[MATCH_SEGMENT]]
-; CHECK-NEXT: [[NEEDLE_IDX_NEXT]] = add i64 [[NEEDLE_IDX]], 16
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[NEEDLE_IDX_NEXT]], [[NEEDLE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[TMP1]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
+; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[NEEDLE_CHECK_VEC]], label %[[MATCH_CHECK_VEC:.*]]
; CHECK: [[MATCH_CHECK_VEC]]:
; CHECK-NEXT: [[MATCH_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_ACCUMULATOR]], %[[NEEDLE_CHECK_VEC]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
-; CHECK-NEXT: br i1 [[TMP2]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]])
+; CHECK-NEXT: br i1 [[TMP4]], label %[[CALCULATE_MATCH:.*]], label %[[SEARCH_CHECK_VEC]]
; CHECK: [[CALCULATE_MATCH]]:
; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ]
; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ]
@@ -662,9 +707,9 @@ define ptr @ensure_not_found_successors_fixed2(ptr %search_start, ptr %search_en
; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]]
; CHECK-NEXT: br label %[[FOUND_MATCH:.*]]
; CHECK: [[SEARCH_CHECK_VEC]]:
-; CHECK-NEXT: [[SEARCH_IDX_NEXT]] = add i64 [[SEARCH_IDX]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[SEARCH_IDX_NEXT]], [[SEARCH_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FIND_FIRST_VEC_HEADER]], label %[[NOT_FOUND:.*]]
+; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[NOT_FOUND:.*]]
; CHECK: [[SCALAR_PREHEADER]]:
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
>From c52fd115e153d0d2060900d8e12cc4ae09808a0d Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Tue, 10 Feb 2026 03:22:27 -0800
Subject: [PATCH 4/4] Rename MatchPred to MatchPredAccLCSSA.
---
.../lib/Transforms/Vectorize/LoopIdiomVectorize.cpp | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index d052eb4201dfe..a420abfae54be 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -1335,8 +1335,8 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
// (3) Check if we found a match.
Builder.SetInsertPoint(BB3);
- PHINode *MatchPred = Builder.CreatePHI(PredVTy, 1, "match_pred");
- Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred);
+ PHINode *MatchPredAccLCSSA = Builder.CreatePHI(PredVTy, 1, "match_pred");
+ Value *IfAnyMatch = Builder.CreateOrReduce(MatchPredAccLCSSA);
Builder.CreateCondBr(IfAnyMatch, BB4, BB5);
DTU.applyUpdates(
{{DominatorTree::Insert, BB3, BB4}, {DominatorTree::Insert, BB3, BB5}});
@@ -1344,10 +1344,9 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
// (4) We found a match. Compute the index of its location and exit.
Builder.SetInsertPoint(BB4);
PHINode *MatchLCSSA = Builder.CreatePHI(PtrTy, 1, "match_start");
- PHINode *MatchPredLCSSA =
- Builder.CreatePHI(MatchPred->getType(), 1, "match_vec");
+ PHINode *MatchPredLCSSA = Builder.CreatePHI(PredVTy, 1, "match_vec");
Value *MatchCnt = Builder.CreateIntrinsic(
- Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()},
+ Intrinsic::experimental_cttz_elts, {I64Ty, PredVTy},
{MatchPredLCSSA, /*ZeroIsPoison=*/Builder.getInt1(true)}, nullptr,
"match_idx");
Value *MatchVal =
@@ -1372,9 +1371,9 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
Match->addIncoming(MatchInit, BB1);
Match->addIncoming(MatchAcc, BB2);
// These are needed to retain LCSSA form.
- MatchPred->addIncoming(MatchAcc, BB2);
+ MatchPredAccLCSSA->addIncoming(MatchAcc, BB2);
MatchLCSSA->addIncoming(Search, BB3);
- MatchPredLCSSA->addIncoming(MatchPred, BB3);
+ MatchPredLCSSA->addIncoming(MatchPredAccLCSSA, BB3);
// Ensure all Phis in the successors of BB4/BB5 have an incoming value from
// them.
More information about the llvm-commits
mailing list