[llvm] [AArch64] Add MATCH loops to LoopIdiomVectorizePass (PR #101976)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 28 08:14:10 PST 2024
https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/101976
>From f2e0f08bb333c8560615b5e11d73827bb2f781b7 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Mon, 15 Jul 2024 17:57:30 +0100
Subject: [PATCH 1/2] [AArch64] Add MATCH loops to LoopIdiomVectorizePass
This patch adds a new loop to LoopIdiomVectorizePass, enabling it to
recognise and use @llvm.experimental.vector.match to vectorise loops
such as:
char* find_first_of(char *first, char *last,
char *s_first, char *s_last) {
for (; first != last; ++first)
for (char *it = s_first; it != s_last; ++it)
if (*first == *it)
return first;
return last;
}
These loops match the C++ standard library's std::find_first_of.
---
.../llvm/Analysis/TargetTransformInfoImpl.h | 4 +-
.../AArch64/AArch64TargetTransformInfo.cpp | 17 +
.../Vectorize/LoopIdiomVectorize.cpp | 425 +++++++++++++++++-
llvm/test/CodeGen/AArch64/find-first-byte.ll | 123 +++++
4 files changed, 559 insertions(+), 10 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/find-first-byte.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 72038c090b7922..d5c76f7f6a5ee2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -775,7 +775,9 @@ class TargetTransformInfoImplBase {
default:
break;
case Intrinsic::experimental_vector_histogram_add:
- // For now, we want explicit support from the target for histograms.
+ case Intrinsic::experimental_vector_match:
+ // For now, we want explicit support from the target for histograms and
+ // matches.
return InstructionCost::getInvalid();
case Intrinsic::allow_runtime_check:
case Intrinsic::allow_ubsan_check:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ec7bb71fd111ff..839563e31cb87e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -914,6 +914,23 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::experimental_vector_match: {
+ EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ unsigned SearchSize =
+ cast<FixedVectorType>(ICA.getArgTypes()[1])->getNumElements();
+ // If we can't lower to MATCH, return an invalid cost.
+ if (getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
+ return InstructionCost::getInvalid();
+ // Base cost for MATCH instructions. At least on the Neoverse V2 and
+ // Neoverse V3 these are cheap operations with the same latency as a vector
+ // ADD, though in most cases we also need to do an extra DUP.
+ InstructionCost Cost = 4;
+ // For fixed-length vectors we currently need an extra five--six
+ // instructions besides the MATCH.
+ if (isa<FixedVectorType>(RetTy))
+ Cost += 6;
+ return Cost;
+ }
default:
break;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 7af7408ed67a8c..dbc2f55e2c0ec8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -10,8 +10,10 @@
// transforms them into more optimized versions of the same loop. In cases
// where this happens, it can be a significant performance win.
//
-// We currently only recognize one loop that finds the first mismatched byte
-// in an array and returns the index, i.e. something like:
+// We currently support two loops:
+//
+// 1. A loop that finds the first mismatched byte in an array and returns the
+// index, i.e. something like:
//
// while (++i != n) {
// if (a[i] != b[i])
@@ -24,12 +26,6 @@
// boundaries. However, even with these checks it is still profitable to do the
// transformation.
//
-//===----------------------------------------------------------------------===//
-//
-// NOTE: This Pass matches a really specific loop pattern because it's only
-// supposed to be a temporary solution until our LoopVectorizer is powerful
-// enought to vectorize it automatically.
-//
// TODO List:
//
// * Add support for the inverse case where we scan for a matching element.
@@ -37,6 +33,35 @@
// * Recognize loops that increment the IV *after* comparing bytes.
// * Allow 32-bit sign-extends of the IV used by the GEP.
//
+// 2. A loop that finds the first matching character in an array among a set of
+// possible matches, e.g.:
+//
+// for (; first != last; ++first)
+// for (s_it = s_first; s_it != s_last; ++s_it)
+// if (*first == *s_it)
+// return first;
+// return last;
+//
+// This corresponds to std::find_first_of (for arrays of bytes) from the C++
+// standard library. This function can be implemented efficiently for targets
+// that support @llvm.experimental.vector.match. For example, on AArch64 targets
+// that implement SVE2, this lowers to a MATCH instruction, which enables us to
+// perform up to 16x16=256 comparisons in one go. This can lead to very
+// significant speedups.
+//
+// TODO:
+//
+// * Add support for `find_first_not_of' loops (i.e. with not-equal comparison).
+// * Make VF a configurable parameter (right now we assume 128-bit vectors).
+// * Potentially adjust the cost model to let the transformation kick-in even if
+// @llvm.experimental.vector.match doesn't have direct support in hardware.
+//
+//===----------------------------------------------------------------------===//
+//
+// NOTE: This Pass matches really specific loop patterns because it's only
+// supposed to be a temporary solution until our LoopVectorizer is powerful
+// enough to vectorize them automatically.
+//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
@@ -79,6 +104,12 @@ static cl::opt<unsigned>
cl::desc("The vectorization factor for byte-compare patterns."),
cl::init(16));
+static cl::opt<bool>
+ DisableFindFirstByte("disable-loop-idiom-vectorize-find-first-byte",
+ cl::Hidden, cl::init(false),
+ cl::desc("Proceed with Loop Idiom Vectorize Pass, but "
+ "do not convert find-first-byte loop(s)."));
+
static cl::opt<bool>
VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false),
cl::desc("Verify loops generated Loop Idiom Vectorize Pass."));
@@ -136,6 +167,18 @@ class LoopIdiomVectorize {
PHINode *IndPhi, Value *MaxLen, Instruction *Index,
Value *Start, bool IncIdx, BasicBlock *FoundBB,
BasicBlock *EndBB);
+
+ bool recognizeFindFirstByte();
+
+ Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU,
+ unsigned VF, Type *CharTy, BasicBlock *ExitSucc,
+ BasicBlock *ExitFail, Value *StartA, Value *EndA,
+ Value *StartB, Value *EndB);
+
+ void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy,
+ BasicBlock *ExitSucc, BasicBlock *ExitFail,
+ Value *StartA, Value *EndA, Value *StartB,
+ Value *EndB);
/// @}
};
} // anonymous namespace
@@ -190,7 +233,13 @@ bool LoopIdiomVectorize::run(Loop *L) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %"
<< CurLoop->getHeader()->getName() << "\n");
- return recognizeByteCompare();
+ if (recognizeByteCompare())
+ return true;
+
+ if (recognizeFindFirstByte())
+ return true;
+
+ return false;
}
bool LoopIdiomVectorize::recognizeByteCompare() {
@@ -939,3 +988,361 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
report_fatal_error("Loops must remain in LCSSA form!");
}
}
+
+bool LoopIdiomVectorize::recognizeFindFirstByte() {
+ // Currently the transformation only works on scalable vector types, although
+ // there is no fundamental reason why it cannot be made to work for fixed
+ // vectors too.
+ if (!TTI->supportsScalableVectors() || DisableFindFirstByte)
+ return false;
+
+ // Define some constants we need throughout.
+ BasicBlock *Header = CurLoop->getHeader();
+ LLVMContext &Ctx = Header->getContext();
+
+ // We are expecting the blocks below. For now, we will bail out for almost
+ // anything other than this.
+ //
+ // Header:
+ // %14 = phi ptr [ %24, %OuterBB ], [ %3, %Header.preheader ]
+ // %15 = load i8, ptr %14, align 1
+ // br label %MatchBB
+ //
+ // MatchBB:
+ // %20 = phi ptr [ %7, %Header ], [ %17, %InnerBB ]
+ // %21 = load i8, ptr %20, align 1
+ // %22 = icmp eq i8 %15, %21
+ // br i1 %22, label %ExitSucc, label %InnerBB
+ //
+ // InnerBB:
+ // %17 = getelementptr inbounds i8, ptr %20, i64 1
+ // %18 = icmp eq ptr %17, %10
+ // br i1 %18, label %OuterBB, label %MatchBB
+ //
+ // OuterBB:
+ // %24 = getelementptr inbounds i8, ptr %14, i64 1
+ // %25 = icmp eq ptr %24, %6
+ // br i1 %25, label %ExitFail, label %Header
+
+ // We expect the four blocks above, which include one nested loop.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 4 ||
+ CurLoop->getSubLoops().size() != 1)
+ return false;
+
+ auto *InnerLoop = CurLoop->getSubLoops().front();
+ PHINode *IndPhi = dyn_cast<PHINode>(&Header->front());
+ if (!IndPhi || IndPhi->getNumIncomingValues() != 2)
+ return false;
+
+ // Check instruction counts.
+ auto LoopBlocks = CurLoop->getBlocks();
+ if (LoopBlocks[0]->sizeWithoutDebug() > 3 ||
+ LoopBlocks[1]->sizeWithoutDebug() > 4 ||
+ LoopBlocks[2]->sizeWithoutDebug() > 3 ||
+ LoopBlocks[3]->sizeWithoutDebug() > 3)
+ return false;
+
+ // Check that no instruction other than IndPhi has outside uses.
+ for (BasicBlock *BB : LoopBlocks)
+ for (Instruction &I : *BB)
+ if (&I != IndPhi)
+ for (User *U : I.users())
+ if (!CurLoop->contains(cast<Instruction>(U)))
+ return false;
+
+ // Match the branch instruction in the header. We are expecting an
+ // unconditional branch to the inner loop.
+ BasicBlock *MatchBB;
+ if (!match(Header->getTerminator(), m_UnconditionalBr(MatchBB)) ||
+ !InnerLoop->contains(MatchBB))
+ return false;
+
+ // MatchBB should be the entrypoint into the inner loop containing the
+ // comparison between a search element and a needle.
+ BasicBlock *ExitSucc, *InnerBB;
+ Value *LoadA, *LoadB;
+ ICmpInst::Predicate MatchPred;
+ if (!match(MatchBB->getTerminator(),
+ m_Br(m_ICmp(MatchPred, m_Value(LoadA), m_Value(LoadB)),
+ m_BasicBlock(ExitSucc), m_BasicBlock(InnerBB))) ||
+ MatchPred != ICmpInst::Predicate::ICMP_EQ ||
+ !InnerLoop->contains(InnerBB))
+ return false;
+
+ // We expect outside uses of `IndPhi' in ExitSucc (and only there).
+ for (User *U : IndPhi->users())
+ if (!CurLoop->contains(cast<Instruction>(U)))
+ if (auto *PN = dyn_cast<PHINode>(U); !PN || PN->getParent() != ExitSucc)
+ return false;
+
+ // Match the loads and check they are simple.
+ Value *A, *B;
+ if (!match(LoadA, m_Load(m_Value(A))) || !cast<LoadInst>(LoadA)->isSimple() ||
+ !match(LoadB, m_Load(m_Value(B))) || !cast<LoadInst>(LoadB)->isSimple())
+ return false;
+
+ // Check we are loading valid characters.
+ Type *CharTy = LoadA->getType();
+ if (!CharTy->isIntegerTy() || LoadB->getType() != CharTy)
+ return false;
+
+ // Choose the vectorisation factor, work out the cost of the match intrinsic
+ // and decide if we should use it.
+ // Note: VF could be parameterised, but 128-bit vectors sounds like a good
+ // default choice for the time being.
+ unsigned VF = 128 / CharTy->getIntegerBitWidth();
+ SmallVector<Type *> Args = {
+ ScalableVectorType::get(CharTy, VF), FixedVectorType::get(CharTy, VF),
+ ScalableVectorType::get(Type::getInt1Ty(Ctx), VF)};
+ IntrinsicCostAttributes Attrs(Intrinsic::experimental_vector_match, Args[2],
+ Args);
+ if (TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_SizeAndLatency) > 4)
+ return false;
+
+ // The loads come from two PHIs, each with two incoming values.
+ PHINode *PNA = dyn_cast<PHINode>(A);
+ PHINode *PNB = dyn_cast<PHINode>(B);
+ if (!PNA || PNA->getNumIncomingValues() != 2 || !PNB ||
+ PNB->getNumIncomingValues() != 2)
+ return false;
+
+ // One PHI comes from the outer loop (PNA), the other one from the inner loop
+ // (PNB). PNA effectively corresponds to IndPhi.
+ if (InnerLoop->contains(PNA))
+ std::swap(PNA, PNB);
+ if (PNA != &Header->front() || PNB != &MatchBB->front())
+ return false;
+
+ // The incoming values of both PHI nodes should be a gep of 1.
+ Value *StartA = PNA->getIncomingValue(0);
+ Value *IndexA = PNA->getIncomingValue(1);
+ if (CurLoop->contains(PNA->getIncomingBlock(0)))
+ std::swap(StartA, IndexA);
+
+ Value *StartB = PNB->getIncomingValue(0);
+ Value *IndexB = PNB->getIncomingValue(1);
+ if (InnerLoop->contains(PNB->getIncomingBlock(0)))
+ std::swap(StartB, IndexB);
+
+ // Match the GEPs.
+ if (!match(IndexA, m_GEP(m_Specific(PNA), m_One())) ||
+ !match(IndexB, m_GEP(m_Specific(PNB), m_One())))
+ return false;
+
+ // Check their result type matches `CharTy'.
+ GetElementPtrInst *GEPA = cast<GetElementPtrInst>(IndexA);
+ GetElementPtrInst *GEPB = cast<GetElementPtrInst>(IndexB);
+ if (GEPA->getResultElementType() != CharTy ||
+ GEPB->getResultElementType() != CharTy)
+ return false;
+
+ // InnerBB should increment the address of the needle pointer.
+ BasicBlock *OuterBB;
+ Value *EndB;
+ if (!match(InnerBB->getTerminator(),
+ m_Br(m_ICmp(MatchPred, m_Specific(GEPB), m_Value(EndB)),
+ m_BasicBlock(OuterBB), m_Specific(MatchBB))) ||
+ MatchPred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(OuterBB))
+ return false;
+
+ // OuterBB should increment the address of the search element pointer.
+ BasicBlock *ExitFail;
+ Value *EndA;
+ if (!match(OuterBB->getTerminator(),
+ m_Br(m_ICmp(MatchPred, m_Specific(GEPA), m_Value(EndA)),
+ m_BasicBlock(ExitFail), m_Specific(Header))) ||
+ MatchPred != ICmpInst::Predicate::ICMP_EQ)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n" << *CurLoop << "\n\n");
+
+ transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, StartA, EndA,
+ StartB, EndB);
+ return true;
+}
+
+Value *LoopIdiomVectorize::expandFindFirstByte(
+ IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy,
+ BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *StartA, Value *EndA,
+ Value *StartB, Value *EndB) {
+ // Set up some types and constants that we intend to reuse.
+ auto *PtrTy = Builder.getPtrTy();
+ auto *I64Ty = Builder.getInt64Ty();
+ auto *PredVTy = ScalableVectorType::get(Builder.getInt1Ty(), VF);
+ auto *CharVTy = ScalableVectorType::get(CharTy, VF);
+ auto *ConstVF = ConstantInt::get(I64Ty, VF);
+
+ // Other common arguments.
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ LLVMContext &Ctx = Preheader->getContext();
+ Value *Passthru = ConstantInt::getNullValue(CharVTy);
+
+ // Split block in the original loop preheader.
+ // SPH is the new preheader to the old scalar loop.
+ BasicBlock *SPH = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
+ nullptr, "scalar_ph");
+
+ // Create the blocks that we're going to use.
+ //
+ // We will have the following loops:
+ // (O) Outer loop where we iterate over the elements of the search array (A).
+ // (I) Inner loop where we iterate over the elements of the needle array (B).
+ //
+ // Overall, the blocks do the following:
+ // (1) Load a vector's worth of A. Go to (2).
+ // (2) (a) Load a vector's worth of B.
+ // (b) Splat the first element of B to the inactive lanes.
+ // (c) Check if any elements match. If so go to (3), otherwise go to (4).
+ // (3) Compute the index of the first match and exit.
+ // (4) Check if we've reached the end of B. If not loop back to (2), otherwise
+ // go to (5).
+ // (5) Check if we've reached the end of A. If not loop back to (1), otherwise
+ // exit.
+ // Block (3) is not part of any loop. Blocks (1,5) and (2,4) belong to the
+ // outer and inner loops, respectively.
+ BasicBlock *BB1 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+ BasicBlock *BB2 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+ BasicBlock *BB3 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+ BasicBlock *BB4 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+ BasicBlock *BB5 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
+
+ // Update LoopInfo with the new loops.
+ auto OL = LI->AllocateLoop();
+ auto IL = LI->AllocateLoop();
+
+ if (auto ParentLoop = CurLoop->getParentLoop()) {
+ ParentLoop->addChildLoop(OL);
+ ParentLoop->addBasicBlockToLoop(BB3, *LI);
+ } else {
+ LI->addTopLevelLoop(OL);
+ }
+
+ // Add the inner loop to the outer.
+ OL->addChildLoop(IL);
+
+ // Add the new basic blocks to the corresponding loops.
+ OL->addBasicBlockToLoop(BB1, *LI);
+ OL->addBasicBlockToLoop(BB5, *LI);
+ IL->addBasicBlockToLoop(BB2, *LI);
+ IL->addBasicBlockToLoop(BB4, *LI);
+
+ // Set a reference to the old scalar loop and create a predicate of VF
+ // elements.
+ Builder.SetInsertPoint(Preheader->getTerminator());
+ Value *Pred16 =
+ Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+ {ConstantInt::get(I64Ty, 0), ConstVF});
+ Builder.CreateCondBr(Builder.getFalse(), SPH, BB1);
+ Preheader->getTerminator()->eraseFromParent();
+ DTU.applyUpdates({{DominatorTree::Insert, Preheader, BB1}});
+
+ // (1) Load a vector's worth of A and branch to the inner loop.
+ Builder.SetInsertPoint(BB1);
+ PHINode *PNA = Builder.CreatePHI(PtrTy, 2, "pa");
+ Value *PredA =
+ Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+ {Builder.CreatePointerCast(PNA, I64Ty),
+ Builder.CreatePointerCast(EndA, I64Ty)});
+ PredA = Builder.CreateAnd(Pred16, PredA);
+ Value *LoadA =
+ Builder.CreateMaskedLoad(CharVTy, PNA, Align(1), PredA, Passthru);
+ Builder.CreateBr(BB2);
+ DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2}});
+
+ // (2) Inner loop.
+ Builder.SetInsertPoint(BB2);
+ PHINode *PNB = Builder.CreatePHI(PtrTy, 2, "pb");
+
+ // (2.a) Load a vector's worth of B.
+ Value *PredB =
+ Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
+ {Builder.CreatePointerCast(PNB, I64Ty),
+ Builder.CreatePointerCast(EndB, I64Ty)});
+ PredB = Builder.CreateAnd(Pred16, PredB);
+ Value *LoadB =
+ Builder.CreateMaskedLoad(CharVTy, PNB, Align(1), PredB, Passthru);
+
+ // (2.b) Splat the first element to the inactive lanes.
+ Value *LoadB0 = Builder.CreateExtractElement(LoadB, uint64_t(0));
+ Value *LoadB0Splat =
+ Builder.CreateVectorSplat(ElementCount::getScalable(VF), LoadB0);
+ LoadB = Builder.CreateSelect(PredB, LoadB, LoadB0Splat);
+ LoadB = Builder.CreateExtractVector(FixedVectorType::get(CharTy, VF), LoadB,
+ ConstantInt::get(I64Ty, 0));
+
+ // (2.c) Test if there's a match.
+ Value *MatchPred = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_match, {CharVTy, LoadB->getType()},
+ {LoadA, LoadB, PredA});
+ Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred);
+ Builder.CreateCondBr(IfAnyMatch, BB3, BB4);
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, BB2, BB3}, {DominatorTree::Insert, BB2, BB4}});
+
+ // (3) We found a match. Compute the index of its location and exit.
+ Builder.SetInsertPoint(BB3);
+ Value *MatchCnt = Builder.CreateIntrinsic(
+ Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()},
+ {MatchPred, /*ZeroIsPoison=*/Builder.getInt1(true)});
+ Value *MatchVal = Builder.CreateGEP(CharTy, PNA, MatchCnt);
+ Builder.CreateBr(ExitSucc);
+ DTU.applyUpdates({{DominatorTree::Insert, BB3, ExitSucc}});
+
+ // (4) Check if we've reached the end of B.
+ Builder.SetInsertPoint(BB4);
+ Value *IncB = Builder.CreateGEP(CharTy, PNB, ConstVF);
+ Builder.CreateCondBr(Builder.CreateICmpULT(IncB, EndB), BB2, BB5);
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, BB4, BB2}, {DominatorTree::Insert, BB4, BB5}});
+
+ // (5) Check if we've reached the end of A.
+ Builder.SetInsertPoint(BB5);
+ Value *IncA = Builder.CreateGEP(CharTy, PNA, ConstVF);
+ Builder.CreateCondBr(Builder.CreateICmpULT(IncA, EndA), BB1, ExitFail);
+ DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1},
+ {DominatorTree::Insert, BB5, ExitFail}});
+
+ // Set up the PHI's.
+ PNA->addIncoming(StartA, Preheader);
+ PNA->addIncoming(IncA, BB5);
+ PNB->addIncoming(StartB, BB1);
+ PNB->addIncoming(IncB, BB4);
+
+ if (VerifyLoops) {
+ OL->verifyLoop();
+ IL->verifyLoop();
+ if (!OL->isRecursivelyLCSSAForm(*DT, *LI))
+ report_fatal_error("Loops must remain in LCSSA form!");
+ }
+
+ return MatchVal;
+}
+
+void LoopIdiomVectorize::transformFindFirstByte(PHINode *IndPhi, unsigned VF,
+ Type *CharTy,
+ BasicBlock *ExitSucc,
+ BasicBlock *ExitFail,
+ Value *StartA, Value *EndA,
+ Value *StartB, Value *EndB) {
+ // Insert the find first byte code at the end of the preheader block.
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+ IRBuilder<> Builder(PHBranch);
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
+
+ Value *MatchVal = expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc,
+ ExitFail, StartA, EndA, StartB, EndB);
+
+ // Add new incoming values with the result of the transformation to PHINodes
+ // of ExitSucc that use IndPhi.
+ for (auto *U : llvm::make_early_inc_range(IndPhi->users()))
+ if (auto *PN = dyn_cast<PHINode>(U); PN && PN->getParent() == ExitSucc)
+ PN->addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent());
+
+ if (VerifyLoops && CurLoop->getParentLoop()) {
+ CurLoop->getParentLoop()->verifyLoop();
+ if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
+ report_fatal_error("Loops must remain in LCSSA form!");
+ }
+}
diff --git a/llvm/test/CodeGen/AArch64/find-first-byte.ll b/llvm/test/CodeGen/AArch64/find-first-byte.ll
new file mode 100644
index 00000000000000..e60553e95e13cf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/find-first-byte.ll
@@ -0,0 +1,123 @@
+; RUN: opt -mattr=+sve2 -mtriple=aarch64 -passes='loop(loop-idiom-vectorize)' -S < %s | FileCheck -check-prefix=SVE2 %s
+; RUN: opt -mattr=-sve2 -mtriple=aarch64 -passes='loop(loop-idiom-vectorize)' -S < %s | FileCheck -check-prefix=NOSVE2 %s
+
+; Base case based on `libcxx/include/__algorithm/find_first_of.h':
+; char* find_first_of(char *first, char *last, char *s_first, char *s_last) {
+; for (; first != last; ++first)
+; for (char *it = s_first; it != s_last; ++it)
+; if (*first == *it)
+; return first;
+; return last;
+; }
+define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+; SVE2-LABEL: define ptr @find_first_of_i8(
+; SVE2: {{%.*}} = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> {{%.*}}, <16 x i8> {{%.*}}, <vscale x 16 x i1> {{%.*}})
+;
+; NOSVE2-LABEL: define ptr @find_first_of_i8(
+; NOSVE2-NOT: {{%.*}} = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> {{%.*}}, <16 x i8> {{%.*}}, <vscale x 16 x i1> {{%.*}})
+;
+ %5 = icmp eq ptr %0, %1
+ %6 = icmp eq ptr %2, %3
+ %7 = or i1 %5, %6
+ br i1 %7, label %21, label %8
+
+8:
+ %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
+ %10 = load i8, ptr %9, align 1
+ br label %14
+
+11:
+ %12 = getelementptr inbounds i8, ptr %15, i64 1
+ %13 = icmp eq ptr %12, %3
+ br i1 %13, label %18, label %14
+
+14:
+ %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
+ %16 = load i8, ptr %15, align 1
+ %17 = icmp eq i8 %10, %16
+ br i1 %17, label %21, label %11
+
+18:
+ %19 = getelementptr inbounds i8, ptr %9, i64 1
+ %20 = icmp eq ptr %19, %1
+ br i1 %20, label %21, label %8
+
+21:
+ %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
+ ret ptr %22
+}
+
+; Same as @find_first_of_i8 but with i16.
+define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+; SVE2-LABEL: define ptr @find_first_of_i16(
+; SVE2: {{%.*}} = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> {{%.*}}, <8 x i16> {{%.*}}, <vscale x 8 x i1> {{%.*}})
+;
+ %5 = icmp eq ptr %0, %1
+ %6 = icmp eq ptr %2, %3
+ %7 = or i1 %5, %6
+ br i1 %7, label %21, label %8
+
+8:
+ %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
+ %10 = load i16, ptr %9, align 1
+ br label %14
+
+11:
+ %12 = getelementptr inbounds i16, ptr %15, i64 1
+ %13 = icmp eq ptr %12, %3
+ br i1 %13, label %18, label %14
+
+14:
+ %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
+ %16 = load i16, ptr %15, align 1
+ %17 = icmp eq i16 %10, %16
+ br i1 %17, label %21, label %11
+
+18:
+ %19 = getelementptr inbounds i16, ptr %9, i64 1
+ %20 = icmp eq ptr %19, %1
+ br i1 %20, label %21, label %8
+
+21:
+ %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
+ ret ptr %22
+}
+
+; Same as @find_first_of_i8 but with `ne' comparison.
+; This is rejected for now, but should eventually be supported.
+define ptr @find_first_not_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+; SVE2-LABEL: define ptr @find_first_not_of_i8(
+; SVE2-NOT: {{%.*}} = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> {{%.*}}, <16 x i8> {{%.*}}, <vscale x 16 x i1> {{%.*}})
+;
+ %5 = icmp eq ptr %0, %1
+ %6 = icmp eq ptr %2, %3
+ %7 = or i1 %5, %6
+ br i1 %7, label %21, label %8
+
+8:
+ %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
+ %10 = load i8, ptr %9, align 1
+ br label %14
+
+11:
+ %12 = getelementptr inbounds i8, ptr %15, i64 1
+ %13 = icmp eq ptr %12, %3
+ br i1 %13, label %18, label %14
+
+14:
+ %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
+ %16 = load i8, ptr %15, align 1
+ %17 = icmp ne i8 %10, %16
+ br i1 %17, label %21, label %11
+
+18:
+ %19 = getelementptr inbounds i8, ptr %9, i64 1
+ %20 = icmp eq ptr %19, %1
+ br i1 %20, label %21, label %8
+
+21:
+ %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
+ ret ptr %22
+}
+
+attributes #0 = { "target-features"="+sve2" }
>From a3292d15186dafa71c02af1ae00bd7ff59e18c69 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Thu, 28 Nov 2024 15:20:13 +0000
Subject: [PATCH 2/2] Address comments
Among other changes:
* Make AArch64TTIImpl::getIntrinsicInstrCost return Invalid for
fixed-length vectors
* Match whole IR in tests and add a negative test for outside uses of
variables that are not rewritten by the transformation
* Rename a few variables to Search/Needle
---
.../AArch64/AArch64TargetTransformInfo.cpp | 12 +-
.../Vectorize/LoopIdiomVectorize.cpp | 204 ++++++-----
llvm/test/CodeGen/AArch64/find-first-byte.ll | 328 +++++++++++++++++-
3 files changed, 424 insertions(+), 120 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 839563e31cb87e..d2823dbc7012a5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -921,15 +921,15 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
// If we can't lower to MATCH, return an invalid cost.
if (getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
return InstructionCost::getInvalid();
+ // We could technically lower some fixed-length vectors to MATCH, which
+ // would currently need an extra five--six instructions. However, we don't
+ // have a use-case for this currently, and so we mark it as invalid.
+ if (isa<FixedVectorType>(RetTy))
+ return InstructionCost::getInvalid();
// Base cost for MATCH instructions. At least on the Neoverse V2 and
// Neoverse V3 these are cheap operations with the same latency as a vector
// ADD, though in most cases we also need to do an extra DUP.
- InstructionCost Cost = 4;
- // For fixed-length vectors we currently need an extra five--six
- // instructions besides the MATCH.
- if (isa<FixedVectorType>(RetTy))
- Cost += 6;
- return Cost;
+ return InstructionCost(4);
}
default:
break;
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index dbc2f55e2c0ec8..a874dd9f8f181b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -107,8 +107,7 @@ static cl::opt<unsigned>
static cl::opt<bool>
DisableFindFirstByte("disable-loop-idiom-vectorize-find-first-byte",
cl::Hidden, cl::init(false),
- cl::desc("Proceed with Loop Idiom Vectorize Pass, but "
- "do not convert find-first-byte loop(s)."));
+ cl::desc("Do not convert find-first-byte loop(s)."));
static cl::opt<bool>
VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false),
@@ -172,13 +171,14 @@ class LoopIdiomVectorize {
Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU,
unsigned VF, Type *CharTy, BasicBlock *ExitSucc,
- BasicBlock *ExitFail, Value *StartA, Value *EndA,
- Value *StartB, Value *EndB);
+ BasicBlock *ExitFail, Value *SearchStart,
+ Value *SearchEnd, Value *NeedleStart,
+ Value *NeedleEnd);
void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy,
BasicBlock *ExitSucc, BasicBlock *ExitFail,
- Value *StartA, Value *EndA, Value *StartB,
- Value *EndB);
+ Value *SearchStart, Value *SearchEnd,
+ Value *NeedleStart, Value *NeedleEnd);
/// @}
};
} // anonymous namespace
@@ -1000,31 +1000,9 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
BasicBlock *Header = CurLoop->getHeader();
LLVMContext &Ctx = Header->getContext();
- // We are expecting the blocks below. For now, we will bail out for almost
- // anything other than this.
- //
- // Header:
- // %14 = phi ptr [ %24, %OuterBB ], [ %3, %Header.preheader ]
- // %15 = load i8, ptr %14, align 1
- // br label %MatchBB
- //
- // MatchBB:
- // %20 = phi ptr [ %7, %Header ], [ %17, %InnerBB ]
- // %21 = load i8, ptr %20, align 1
- // %22 = icmp eq i8 %15, %21
- // br i1 %22, label %ExitSucc, label %InnerBB
- //
- // InnerBB:
- // %17 = getelementptr inbounds i8, ptr %20, i64 1
- // %18 = icmp eq ptr %17, %10
- // br i1 %18, label %OuterBB, label %MatchBB
- //
- // OuterBB:
- // %24 = getelementptr inbounds i8, ptr %14, i64 1
- // %25 = icmp eq ptr %24, %6
- // br i1 %25, label %ExitFail, label %Header
-
- // We expect the four blocks above, which include one nested loop.
+ // We are expecting the four blocks defined below: Header, MatchBB, InnerBB,
+ // and OuterBB. For now, we will bail out for almost anything else. The four
+ // blocks contain one nested loop.
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 4 ||
CurLoop->getSubLoops().size() != 1)
return false;
@@ -1052,6 +1030,11 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
// Match the branch instruction in the header. We are expecting an
// unconditional branch to the inner loop.
+ //
+ // Header:
+ // %14 = phi ptr [ %24, %OuterBB ], [ %3, %Header.preheader ]
+ // %15 = load i8, ptr %14, align 1
+ // br label %MatchBB
BasicBlock *MatchBB;
if (!match(Header->getTerminator(), m_UnconditionalBr(MatchBB)) ||
!InnerLoop->contains(MatchBB))
@@ -1059,6 +1042,12 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
// MatchBB should be the entrypoint into the inner loop containing the
// comparison between a search element and a needle.
+ //
+ // MatchBB:
+ // %20 = phi ptr [ %7, %Header ], [ %17, %InnerBB ]
+ // %21 = load i8, ptr %20, align 1
+ // %22 = icmp eq i8 %15, %21
+ // br i1 %22, label %ExitSucc, label %InnerBB
BasicBlock *ExitSucc, *InnerBB;
Value *LoadA, *LoadB;
ICmpInst::Predicate MatchPred;
@@ -1086,10 +1075,9 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
if (!CharTy->isIntegerTy() || LoadB->getType() != CharTy)
return false;
- // Choose the vectorisation factor, work out the cost of the match intrinsic
- // and decide if we should use it.
- // Note: VF could be parameterised, but 128-bit vectors sounds like a good
- // default choice for the time being.
+ // Pick the vectorisation factor based on CharTy, work out the cost of the
+ // match intrinsic and decide if we should use it.
+ // Note: For the time being we assume 128-bit vectors.
unsigned VF = 128 / CharTy->getIntegerBitWidth();
SmallVector<Type *> Args = {
ScalableVectorType::get(CharTy, VF), FixedVectorType::get(CharTy, VF),
@@ -1129,7 +1117,7 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
!match(IndexB, m_GEP(m_Specific(PNB), m_One())))
return false;
- // Check their result type matches `CharTy'.
+ // Check the GEPs result type matches `CharTy'.
GetElementPtrInst *GEPA = cast<GetElementPtrInst>(IndexA);
GetElementPtrInst *GEPB = cast<GetElementPtrInst>(IndexB);
if (GEPA->getResultElementType() != CharTy ||
@@ -1137,6 +1125,11 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
return false;
// InnerBB should increment the address of the needle pointer.
+ //
+ // InnerBB:
+ // %17 = getelementptr inbounds i8, ptr %20, i64 1
+ // %18 = icmp eq ptr %17, %10
+ // br i1 %18, label %OuterBB, label %MatchBB
BasicBlock *OuterBB;
Value *EndB;
if (!match(InnerBB->getTerminator(),
@@ -1146,6 +1139,11 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
return false;
// OuterBB should increment the address of the search element pointer.
+ //
+ // OuterBB:
+ // %24 = getelementptr inbounds i8, ptr %14, i64 1
+ // %25 = icmp eq ptr %24, %6
+ // br i1 %25, label %ExitFail, label %Header
BasicBlock *ExitFail;
Value *EndA;
if (!match(OuterBB->getTerminator(),
@@ -1154,7 +1152,7 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
MatchPred != ICmpInst::Predicate::ICMP_EQ)
return false;
- LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n" << *CurLoop << "\n\n");
+ LLVM_DEBUG(dbgs() << "Found idiom in loop: \n" << *CurLoop << "\n\n");
transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, StartA, EndA,
StartB, EndB);
@@ -1163,8 +1161,8 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
Value *LoopIdiomVectorize::expandFindFirstByte(
IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy,
- BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *StartA, Value *EndA,
- Value *StartB, Value *EndB) {
+ BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *SearchStart,
+ Value *SearchEnd, Value *NeedleStart, Value *NeedleEnd) {
// Set up some types and constants that we intend to reuse.
auto *PtrTy = Builder.getPtrTy();
auto *I64Ty = Builder.getInt64Ty();
@@ -1185,19 +1183,19 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
// Create the blocks that we're going to use.
//
// We will have the following loops:
- // (O) Outer loop where we iterate over the elements of the search array (A).
- // (I) Inner loop where we iterate over the elements of the needle array (B).
+ // (O) Outer loop where we iterate over the elements of the search array.
+ // (I) Inner loop where we iterate over the elements of the needle array.
//
// Overall, the blocks do the following:
- // (1) Load a vector's worth of A. Go to (2).
- // (2) (a) Load a vector's worth of B.
- // (b) Splat the first element of B to the inactive lanes.
+ // (1) Load the search array. Go to (2).
+ // (2) (a) Load the needle array.
+ // (b) Splat the first element to the inactive lanes.
// (c) Check if any elements match. If so go to (3), otherwise go to (4).
// (3) Compute the index of the first match and exit.
- // (4) Check if we've reached the end of B. If not loop back to (2), otherwise
- // go to (5).
- // (5) Check if we've reached the end of A. If not loop back to (1), otherwise
- // exit.
+ // (4) Check if we've reached the end of the needle array. If not loop back to
+ // (2), otherwise go to (5).
+ // (5) Check if we've reached the end of the search array. If not loop back to
+ // (1), otherwise exit.
// Block (3) is not part of any loop. Blocks (1,5) and (2,4) belong to the
// outer and inner loops, respectively.
BasicBlock *BB1 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
@@ -1207,24 +1205,24 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
BasicBlock *BB5 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH);
// Update LoopInfo with the new loops.
- auto OL = LI->AllocateLoop();
- auto IL = LI->AllocateLoop();
+ auto OuterLoop = LI->AllocateLoop();
+ auto InnerLoop = LI->AllocateLoop();
if (auto ParentLoop = CurLoop->getParentLoop()) {
- ParentLoop->addChildLoop(OL);
+ ParentLoop->addChildLoop(OuterLoop);
ParentLoop->addBasicBlockToLoop(BB3, *LI);
} else {
- LI->addTopLevelLoop(OL);
+ LI->addTopLevelLoop(OuterLoop);
}
// Add the inner loop to the outer.
- OL->addChildLoop(IL);
+ OuterLoop->addChildLoop(InnerLoop);
// Add the new basic blocks to the corresponding loops.
- OL->addBasicBlockToLoop(BB1, *LI);
- OL->addBasicBlockToLoop(BB5, *LI);
- IL->addBasicBlockToLoop(BB2, *LI);
- IL->addBasicBlockToLoop(BB4, *LI);
+ OuterLoop->addBasicBlockToLoop(BB1, *LI);
+ OuterLoop->addBasicBlockToLoop(BB5, *LI);
+ InnerLoop->addBasicBlockToLoop(BB2, *LI);
+ InnerLoop->addBasicBlockToLoop(BB4, *LI);
// Set a reference to the old scalar loop and create a predicate of VF
// elements.
@@ -1236,44 +1234,44 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
Preheader->getTerminator()->eraseFromParent();
DTU.applyUpdates({{DominatorTree::Insert, Preheader, BB1}});
- // (1) Load a vector's worth of A and branch to the inner loop.
+ // (1) Load the search array and branch to the inner loop.
Builder.SetInsertPoint(BB1);
- PHINode *PNA = Builder.CreatePHI(PtrTy, 2, "pa");
- Value *PredA =
+ PHINode *Search = Builder.CreatePHI(PtrTy, 2, "psearch");
+ Value *PredSearch =
Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
- {Builder.CreatePointerCast(PNA, I64Ty),
- Builder.CreatePointerCast(EndA, I64Ty)});
- PredA = Builder.CreateAnd(Pred16, PredA);
- Value *LoadA =
- Builder.CreateMaskedLoad(CharVTy, PNA, Align(1), PredA, Passthru);
+ {Builder.CreatePointerCast(Search, I64Ty),
+ Builder.CreatePointerCast(SearchEnd, I64Ty)});
+ PredSearch = Builder.CreateAnd(Pred16, PredSearch);
+ Value *LoadSearch =
+ Builder.CreateMaskedLoad(CharVTy, Search, Align(1), PredSearch, Passthru);
Builder.CreateBr(BB2);
DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2}});
// (2) Inner loop.
Builder.SetInsertPoint(BB2);
- PHINode *PNB = Builder.CreatePHI(PtrTy, 2, "pb");
+ PHINode *Needle = Builder.CreatePHI(PtrTy, 2, "pneedle");
- // (2.a) Load a vector's worth of B.
- Value *PredB =
+ // (2.a) Load the needle array.
+ Value *PredNeedle =
Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
- {Builder.CreatePointerCast(PNB, I64Ty),
- Builder.CreatePointerCast(EndB, I64Ty)});
- PredB = Builder.CreateAnd(Pred16, PredB);
- Value *LoadB =
- Builder.CreateMaskedLoad(CharVTy, PNB, Align(1), PredB, Passthru);
+ {Builder.CreatePointerCast(Needle, I64Ty),
+ Builder.CreatePointerCast(NeedleEnd, I64Ty)});
+ PredNeedle = Builder.CreateAnd(Pred16, PredNeedle);
+ Value *LoadNeedle =
+ Builder.CreateMaskedLoad(CharVTy, Needle, Align(1), PredNeedle, Passthru);
// (2.b) Splat the first element to the inactive lanes.
- Value *LoadB0 = Builder.CreateExtractElement(LoadB, uint64_t(0));
- Value *LoadB0Splat =
- Builder.CreateVectorSplat(ElementCount::getScalable(VF), LoadB0);
- LoadB = Builder.CreateSelect(PredB, LoadB, LoadB0Splat);
- LoadB = Builder.CreateExtractVector(FixedVectorType::get(CharTy, VF), LoadB,
- ConstantInt::get(I64Ty, 0));
+ Value *Needle0 = Builder.CreateExtractElement(LoadNeedle, uint64_t(0));
+ Value *Needle0Splat =
+ Builder.CreateVectorSplat(ElementCount::getScalable(VF), Needle0);
+ LoadNeedle = Builder.CreateSelect(PredNeedle, LoadNeedle, Needle0Splat);
+ LoadNeedle = Builder.CreateExtractVector(
+ FixedVectorType::get(CharTy, VF), LoadNeedle, ConstantInt::get(I64Ty, 0));
// (2.c) Test if there's a match.
Value *MatchPred = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_match, {CharVTy, LoadB->getType()},
- {LoadA, LoadB, PredA});
+ Intrinsic::experimental_vector_match, {CharVTy, LoadNeedle->getType()},
+ {LoadSearch, LoadNeedle, PredSearch});
Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred);
Builder.CreateCondBr(IfAnyMatch, BB3, BB4);
DTU.applyUpdates(
@@ -1284,46 +1282,45 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
Value *MatchCnt = Builder.CreateIntrinsic(
Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()},
{MatchPred, /*ZeroIsPoison=*/Builder.getInt1(true)});
- Value *MatchVal = Builder.CreateGEP(CharTy, PNA, MatchCnt);
+ Value *MatchVal = Builder.CreateGEP(CharTy, Search, MatchCnt);
Builder.CreateBr(ExitSucc);
DTU.applyUpdates({{DominatorTree::Insert, BB3, ExitSucc}});
- // (4) Check if we've reached the end of B.
+ // (4) Check if we've reached the end of the needle array.
Builder.SetInsertPoint(BB4);
- Value *IncB = Builder.CreateGEP(CharTy, PNB, ConstVF);
- Builder.CreateCondBr(Builder.CreateICmpULT(IncB, EndB), BB2, BB5);
+ Value *NextNeedle = Builder.CreateGEP(CharTy, Needle, ConstVF);
+ Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedle, NeedleEnd), BB2, BB5);
DTU.applyUpdates(
{{DominatorTree::Insert, BB4, BB2}, {DominatorTree::Insert, BB4, BB5}});
- // (5) Check if we've reached the end of A.
+ // (5) Check if we've reached the end of the search array.
Builder.SetInsertPoint(BB5);
- Value *IncA = Builder.CreateGEP(CharTy, PNA, ConstVF);
- Builder.CreateCondBr(Builder.CreateICmpULT(IncA, EndA), BB1, ExitFail);
+ Value *NextSearch = Builder.CreateGEP(CharTy, Search, ConstVF);
+ Builder.CreateCondBr(Builder.CreateICmpULT(NextSearch, SearchEnd), BB1,
+ ExitFail);
DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1},
{DominatorTree::Insert, BB5, ExitFail}});
// Set up the PHI's.
- PNA->addIncoming(StartA, Preheader);
- PNA->addIncoming(IncA, BB5);
- PNB->addIncoming(StartB, BB1);
- PNB->addIncoming(IncB, BB4);
+ Search->addIncoming(SearchStart, Preheader);
+ Search->addIncoming(NextSearch, BB5);
+ Needle->addIncoming(NeedleStart, BB1);
+ Needle->addIncoming(NextNeedle, BB4);
if (VerifyLoops) {
- OL->verifyLoop();
- IL->verifyLoop();
- if (!OL->isRecursivelyLCSSAForm(*DT, *LI))
+ OuterLoop->verifyLoop();
+ InnerLoop->verifyLoop();
+ if (!OuterLoop->isRecursivelyLCSSAForm(*DT, *LI))
report_fatal_error("Loops must remain in LCSSA form!");
}
return MatchVal;
}
-void LoopIdiomVectorize::transformFindFirstByte(PHINode *IndPhi, unsigned VF,
- Type *CharTy,
- BasicBlock *ExitSucc,
- BasicBlock *ExitFail,
- Value *StartA, Value *EndA,
- Value *StartB, Value *EndB) {
+void LoopIdiomVectorize::transformFindFirstByte(
+ PHINode *IndPhi, unsigned VF, Type *CharTy, BasicBlock *ExitSucc,
+ BasicBlock *ExitFail, Value *SearchStart, Value *SearchEnd,
+ Value *NeedleStart, Value *NeedleEnd) {
// Insert the find first byte code at the end of the preheader block.
BasicBlock *Preheader = CurLoop->getLoopPreheader();
BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
@@ -1331,8 +1328,9 @@ void LoopIdiomVectorize::transformFindFirstByte(PHINode *IndPhi, unsigned VF,
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
- Value *MatchVal = expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc,
- ExitFail, StartA, EndA, StartB, EndB);
+ Value *MatchVal =
+ expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail,
+ SearchStart, SearchEnd, NeedleStart, NeedleEnd);
// Add new incoming values with the result of the transformation to PHINodes
// of ExitSucc that use IndPhi.
diff --git a/llvm/test/CodeGen/AArch64/find-first-byte.ll b/llvm/test/CodeGen/AArch64/find-first-byte.ll
index e60553e95e13cf..a324896413d78c 100644
--- a/llvm/test/CodeGen/AArch64/find-first-byte.ll
+++ b/llvm/test/CodeGen/AArch64/find-first-byte.ll
@@ -1,5 +1,5 @@
-; RUN: opt -mattr=+sve2 -mtriple=aarch64 -passes='loop(loop-idiom-vectorize)' -S < %s | FileCheck -check-prefix=SVE2 %s
-; RUN: opt -mattr=-sve2 -mtriple=aarch64 -passes='loop(loop-idiom-vectorize)' -S < %s | FileCheck -check-prefix=NOSVE2 %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=aarch64 -mattr=+sve -passes='loop(loop-idiom-vectorize)' -verify-loop-info -verify-dom-info -S < %s | FileCheck %s
; Base case based on `libcxx/include/__algorithm/find_first_of.h':
; char* find_first_of(char *first, char *last, char *s_first, char *s_last) {
@@ -10,11 +10,77 @@
; return last;
; }
define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
-; SVE2-LABEL: define ptr @find_first_of_i8(
-; SVE2: {{%.*}} = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> {{%.*}}, <16 x i8> {{%.*}}, <vscale x 16 x i1> {{%.*}})
-;
-; NOSVE2-LABEL: define ptr @find_first_of_i8(
-; NOSVE2-NOT: {{%.*}} = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> {{%.*}}, <16 x i8> {{%.*}}, <vscale x 16 x i1> {{%.*}})
+; CHECK-LABEL: define ptr @find_first_of_i8(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[BB48:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK: [[_PREHEADER:.*:]]
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16)
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]]
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[SEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[SEARCH]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP10]], i64 [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = and <vscale x 16 x i1> [[TMP8]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[SEARCH]], i32 1, <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: br label %[[BB15:.*]]
+; CHECK: [[BB15]]:
+; CHECK-NEXT: [[NEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[NEEDLE]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP16]], i64 [[TMP17]])
+; CHECK-NEXT: [[TMP19:%.*]] = and <vscale x 16 x i1> [[TMP8]], [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[NEEDLE]], i32 1, <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i8> [[TMP20]], i64 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP21]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = select <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> [[TMP20]], <vscale x 16 x i8> [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP23:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[TMP22]], i64 0)
+; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[TMP14]], <16 x i8> [[TMP23]], <vscale x 16 x i1> [[TMP13]])
+; CHECK-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP24]])
+; CHECK-NEXT: br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]]
+; CHECK: [[BB26]]:
+; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP24]], i1 true)
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SEARCH]], i64 [[TMP27]]
+; CHECK-NEXT: br label %[[DOTLOOPEXIT:.*]]
+; CHECK: [[TMP29]]:
+; CHECK-NEXT: [[TMP30]] = getelementptr i8, ptr [[NEEDLE]], i64 16
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]]
+; CHECK-NEXT: br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]]
+; CHECK: [[TMP32]]:
+; CHECK-NEXT: [[TMP33]] = getelementptr i8, ptr [[SEARCH]], i64 16
+; CHECK-NEXT: [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]]
+; CHECK-NEXT: br i1 [[TMP34]], label %[[BB9]], label %[[DOTLOOPEXIT1:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: br label %[[BB35:.*]]
+; CHECK: [[BB35]]:
+; CHECK-NEXT: [[TMP36:%.*]] = phi ptr [ [[TMP46:%.*]], %[[TMP45:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1
+; CHECK-NEXT: br label %[[BB41:.*]]
+; CHECK: [[BB38:.*]]:
+; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[TMP42:%.*]], i64 1
+; CHECK-NEXT: [[TMP40:%.*]] = icmp eq ptr [[TMP39]], [[TMP3]]
+; CHECK-NEXT: br i1 [[TMP40]], label %[[TMP45]], label %[[BB41]]
+; CHECK: [[BB41]]:
+; CHECK-NEXT: [[TMP42]] = phi ptr [ [[TMP2]], %[[BB35]] ], [ [[TMP39]], %[[BB38]] ]
+; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1
+; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i8 [[TMP37]], [[TMP43]]
+; CHECK-NEXT: br i1 [[TMP44]], label %[[DOTLOOPEXIT]], label %[[BB38]]
+; CHECK: [[TMP45]]:
+; CHECK-NEXT: [[TMP46]] = getelementptr inbounds i8, ptr [[TMP36]], i64 1
+; CHECK-NEXT: [[TMP47:%.*]] = icmp eq ptr [[TMP46]], [[TMP1]]
+; CHECK-NEXT: br i1 [[TMP47]], label %[[DOTLOOPEXIT1]], label %[[BB35]]
+; CHECK: [[_LOOPEXIT:.*:]]
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP36]], %[[BB41]] ], [ [[TMP28]], %[[BB26]] ]
+; CHECK-NEXT: br label %[[BB48]]
+; CHECK: [[_LOOPEXIT1:.*:]]
+; CHECK-NEXT: br label %[[BB48]]
+; CHECK: [[BB48]]:
+; CHECK-NEXT: [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT: ret ptr [[TMP49]]
;
%5 = icmp eq ptr %0, %1
%6 = icmp eq ptr %2, %3
@@ -48,9 +114,79 @@ define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
}
; Same as @find_first_of_i8 but with i16.
+; This is accepted and generates a similar loop.
define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
-; SVE2-LABEL: define ptr @find_first_of_i16(
-; SVE2: {{%.*}} = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> {{%.*}}, <8 x i16> {{%.*}}, <vscale x 8 x i1> {{%.*}})
+; CHECK-LABEL: define ptr @find_first_of_i16(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[BB48:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK: [[_PREHEADER:.*:]]
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]]
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[SEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[SEARCH]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP10]], i64 [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = and <vscale x 8 x i1> [[TMP8]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[SEARCH]], i32 1, <vscale x 8 x i1> [[TMP13]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT: br label %[[BB15:.*]]
+; CHECK: [[BB15]]:
+; CHECK-NEXT: [[NEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[NEEDLE]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP16]], i64 [[TMP17]])
+; CHECK-NEXT: [[TMP19:%.*]] = and <vscale x 8 x i1> [[TMP8]], [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[NEEDLE]], i32 1, <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <vscale x 8 x i16> [[TMP20]], i64 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP21]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i16> [[TMP20]], <vscale x 8 x i16> [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP23:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[TMP22]], i64 0)
+; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> [[TMP14]], <8 x i16> [[TMP23]], <vscale x 8 x i1> [[TMP13]])
+; CHECK-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1(<vscale x 8 x i1> [[TMP24]])
+; CHECK-NEXT: br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]]
+; CHECK: [[BB26]]:
+; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[TMP24]], i1 true)
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[SEARCH]], i64 [[TMP27]]
+; CHECK-NEXT: br label %[[DOTLOOPEXIT:.*]]
+; CHECK: [[TMP29]]:
+; CHECK-NEXT: [[TMP30]] = getelementptr i16, ptr [[NEEDLE]], i64 8
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]]
+; CHECK-NEXT: br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]]
+; CHECK: [[TMP32]]:
+; CHECK-NEXT: [[TMP33]] = getelementptr i16, ptr [[SEARCH]], i64 8
+; CHECK-NEXT: [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]]
+; CHECK-NEXT: br i1 [[TMP34]], label %[[BB9]], label %[[DOTLOOPEXIT1:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: br label %[[BB35:.*]]
+; CHECK: [[BB35]]:
+; CHECK-NEXT: [[TMP36:%.*]] = phi ptr [ [[TMP46:%.*]], %[[TMP45:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP36]], align 1
+; CHECK-NEXT: br label %[[BB41:.*]]
+; CHECK: [[BB38:.*]]:
+; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i16, ptr [[TMP42:%.*]], i64 1
+; CHECK-NEXT: [[TMP40:%.*]] = icmp eq ptr [[TMP39]], [[TMP3]]
+; CHECK-NEXT: br i1 [[TMP40]], label %[[TMP45]], label %[[BB41]]
+; CHECK: [[BB41]]:
+; CHECK-NEXT: [[TMP42]] = phi ptr [ [[TMP2]], %[[BB35]] ], [ [[TMP39]], %[[BB38]] ]
+; CHECK-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP42]], align 1
+; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i16 [[TMP37]], [[TMP43]]
+; CHECK-NEXT: br i1 [[TMP44]], label %[[DOTLOOPEXIT]], label %[[BB38]]
+; CHECK: [[TMP45]]:
+; CHECK-NEXT: [[TMP46]] = getelementptr inbounds i16, ptr [[TMP36]], i64 1
+; CHECK-NEXT: [[TMP47:%.*]] = icmp eq ptr [[TMP46]], [[TMP1]]
+; CHECK-NEXT: br i1 [[TMP47]], label %[[DOTLOOPEXIT1]], label %[[BB35]]
+; CHECK: [[_LOOPEXIT:.*:]]
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP36]], %[[BB41]] ], [ [[TMP28]], %[[BB26]] ]
+; CHECK-NEXT: br label %[[BB48]]
+; CHECK: [[_LOOPEXIT1:.*:]]
+; CHECK-NEXT: br label %[[BB48]]
+; CHECK: [[BB48]]:
+; CHECK-NEXT: [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT: ret ptr [[TMP49]]
;
%5 = icmp eq ptr %0, %1
%6 = icmp eq ptr %2, %3
@@ -86,8 +222,39 @@ define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
; Same as @find_first_of_i8 but with `ne' comparison.
; This is rejected for now, but should eventually be supported.
define ptr @find_first_not_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
-; SVE2-LABEL: define ptr @find_first_not_of_i8(
-; SVE2-NOT: {{%.*}} = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> {{%.*}}, <16 x i8> {{%.*}}, <vscale x 16 x i1> {{%.*}})
+; CHECK-LABEL: define ptr @find_first_not_of_i8(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK: [[_PREHEADER:.*:]]
+; CHECK-NEXT: br label %[[BB8:.*]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT: br label %[[BB14:.*]]
+; CHECK: [[BB11:.*]]:
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i8 [[TMP10]], [[TMP16]]
+; CHECK-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
+; CHECK: [[TMP18]]:
+; CHECK-NEXT: [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
+; CHECK-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
+; CHECK: [[_LOOPEXIT:.*:]]
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
+; CHECK-NEXT: br label %[[BB21]]
+; CHECK: [[_LOOPEXIT1:.*:]]
+; CHECK-NEXT: br label %[[BB21]]
+; CHECK: [[BB21]]:
+; CHECK-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT: ret ptr [[TMP22]]
;
%5 = icmp eq ptr %0, %1
%6 = icmp eq ptr %2, %3
@@ -120,4 +287,143 @@ define ptr @find_first_not_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
ret ptr %22
}
+; This is the same as @find_first_of_i8 but without SVE2, which is required
+; to perform the conversion.
+define ptr @find_first_of_i8_nosve2(ptr %0, ptr %1, ptr %2, ptr %3) {
+; CHECK-LABEL: define ptr @find_first_of_i8_nosve2(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK: [[_PREHEADER:.*:]]
+; CHECK-NEXT: br label %[[BB8:.*]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT: br label %[[BB14:.*]]
+; CHECK: [[BB11:.*]]:
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i8 [[TMP10]], [[TMP16]]
+; CHECK-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
+; CHECK: [[TMP18]]:
+; CHECK-NEXT: [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
+; CHECK-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
+; CHECK: [[_LOOPEXIT:.*:]]
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
+; CHECK-NEXT: br label %[[BB21]]
+; CHECK: [[_LOOPEXIT1:.*:]]
+; CHECK-NEXT: br label %[[BB21]]
+; CHECK: [[BB21]]:
+; CHECK-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT: ret ptr [[TMP22]]
+;
+ %5 = icmp eq ptr %0, %1
+ %6 = icmp eq ptr %2, %3
+ %7 = or i1 %5, %6
+ br i1 %7, label %21, label %8
+
+8:
+ %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
+ %10 = load i8, ptr %9, align 1
+ br label %14
+
+11:
+ %12 = getelementptr inbounds i8, ptr %15, i64 1
+ %13 = icmp eq ptr %12, %3
+ br i1 %13, label %18, label %14
+
+14:
+ %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
+ %16 = load i8, ptr %15, align 1
+ %17 = icmp eq i8 %10, %16
+ br i1 %17, label %21, label %11
+
+18:
+ %19 = getelementptr inbounds i8, ptr %9, i64 1
+ %20 = icmp eq ptr %19, %1
+ br i1 %20, label %21, label %8
+
+21:
+ %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
+ ret ptr %22
+}
+
+; Same as @find_first_of_i8 but here we use the inner PHI outside the loop nest.
+; This isn't supported.
+define ptr @find_first_of_i8_outside_use(ptr %0, ptr %1, ptr %2, ptr %3) #0 {
+; CHECK-LABEL: define ptr @find_first_of_i8_outside_use(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK: [[_PREHEADER:.*:]]
+; CHECK-NEXT: br label %[[BB8:.*]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT: br label %[[BB14:.*]]
+; CHECK: [[BB11:.*]]:
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]]
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i8 [[TMP10]], [[TMP16]]
+; CHECK-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]]
+; CHECK: [[TMP18]]:
+; CHECK-NEXT: [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]]
+; CHECK-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]]
+; CHECK: [[_LOOPEXIT:.*:]]
+; CHECK-NEXT: [[DOTLCSSA3:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ]
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP15]], %[[BB14]] ]
+; CHECK-NEXT: br label %[[BB21]]
+; CHECK: [[_LOOPEXIT1:.*:]]
+; CHECK-NEXT: br label %[[BB21]]
+; CHECK: [[BB21]]:
+; CHECK-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA3]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = phi ptr [ [[TMP3]], [[TMP4]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP3]], %[[DOTLOOPEXIT1]] ]
+; CHECK-NEXT: ret ptr [[TMP23]]
+;
+ %5 = icmp eq ptr %0, %1
+ %6 = icmp eq ptr %2, %3
+ %7 = or i1 %5, %6
+ br i1 %7, label %21, label %8
+
+8:
+ %9 = phi ptr [ %19, %18 ], [ %0, %4 ]
+ %10 = load i8, ptr %9, align 1
+ br label %14
+
+11:
+ %12 = getelementptr inbounds i8, ptr %15, i64 1
+ %13 = icmp eq ptr %12, %3
+ br i1 %13, label %18, label %14
+
+14:
+ %15 = phi ptr [ %2, %8 ], [ %12, %11 ]
+ %16 = load i8, ptr %15, align 1
+ %17 = icmp ne i8 %10, %16
+ br i1 %17, label %21, label %11
+
+18:
+ %19 = getelementptr inbounds i8, ptr %9, i64 1
+ %20 = icmp eq ptr %19, %1
+ br i1 %20, label %21, label %8
+
+21:
+ %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ]
+ %23 = phi ptr [ %3, %4 ], [ %15, %14 ], [ %3, %18 ]
+ ret ptr %23
+}
+
attributes #0 = { "target-features"="+sve2" }
More information about the llvm-commits
mailing list