[llvm] [RISCV] Introduce the RISCVLoopIdiomRecognizePass (PR #92441)
Min-Yih Hsu via llvm-commits
llvm-commits at lists.llvm.org
Thu May 16 11:58:59 PDT 2024
https://github.com/mshockwave updated https://github.com/llvm/llvm-project/pull/92441
>From f1555792a05e03dd5e6df36e62e2f8dfa0c877b4 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Mon, 22 Apr 2024 16:09:46 -0700
Subject: [PATCH 1/2] [RISCV] Introduce the RISCVLoopIdiomRecognizePass
RISCVLoopIdiomRecognize pattern matches a specific kind of byte compare
loop that looks like:
```
while (i != max_len)
if (a[i] != b[i])
break;
... use index i ...
```
And effectively vectorizes it.
This is similar to AArch64's AArch64LoopIdiomTransform pass, except that
we use VP intrinsics here.
This improves 557.xz_r in SPEC2017 by nearly 20% (on refrate workload)
in terms of dynamic instruction counts.
---
llvm/lib/Target/RISCV/CMakeLists.txt | 1 +
.../Target/RISCV/RISCVLoopIdiomRecognize.cpp | 752 +++++++
.../Target/RISCV/RISCVLoopIdiomRecognize.h | 25 +
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 20 +
llvm/lib/Target/RISCV/RISCVTargetMachine.h | 3 +
.../LoopIdiom/RISCV/byte-compare-index.ll | 1771 +++++++++++++++++
6 files changed, 2572 insertions(+)
create mode 100644 llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.cpp
create mode 100644 llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.h
create mode 100644 llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 8715403f3839a..8d9cb65940097 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -43,6 +43,7 @@ add_llvm_target(RISCVCodeGen
RISCVInstrInfo.cpp
RISCVISelDAGToDAG.cpp
RISCVISelLowering.cpp
+ RISCVLoopIdiomRecognize.cpp
RISCVMachineFunctionInfo.cpp
RISCVMergeBaseOffset.cpp
RISCVOptWInstrs.cpp
diff --git a/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.cpp b/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.cpp
new file mode 100644
index 0000000000000..331e0a3ea6534
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.cpp
@@ -0,0 +1,752 @@
+//===-------- RISCVLoopIdiomRecognize.cpp - Loop idiom recognition --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVLoopIdiomRecognize.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/TargetParser/RISCVTargetParser.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-loop-idiom"
+
+// Master kill switch for the whole pass. This is a "disable" flag, so it must
+// default to false: with a default of true the pass would bail out on every
+// loop unless the user explicitly passed -riscv-disable-all-loop-idiom=false.
+static cl::opt<bool>
+    DisableAll("riscv-disable-all-loop-idiom", cl::Hidden, cl::init(false),
+               cl::desc("Disable RISCV Loop Idiom Recognize Pass."));
+
+// Keeps the pass running but skips the byte-compare idiom.
+static cl::opt<bool> DisableByteCmp(
+    "disable-riscv-loop-idiom-bytecmp", cl::Hidden, cl::init(false),
+    cl::desc("Proceed with RISCV Loop Idiom Recognize Pass, but do "
+             "not convert byte-compare loop(s)."));
+
+// CustomLoopIdiomLMUL can be used to customize LMUL for vectorizing loops.
+// It uses the exponent value to represent LMUL, i.e. 0 -> LMUL 1, 1 -> LMUL 2,
+// 2 -> LMUL 4, 3 -> LMUL 8. Larger values are clamped to 3 by
+// getBestVectorTypeForLoopIdiom().
+static cl::opt<unsigned>
+    CustomLoopIdiomLMUL("riscv-loop-idiom-lmul", cl::Hidden, cl::init(1),
+                        cl::Optional,
+                        cl::desc("Customize LMUL for vector loop."));
+
+namespace {
+
+/// Per-loop driver for the RISC-V loop idiom recognition. It holds references
+/// to the analyses handed in by the pass and rewrites the loop passed to run()
+/// when the byte-compare idiom is matched.
+class RISCVLoopIdiomRecognize {
+  // Loop currently being processed; assigned at the start of run().
+  Loop *CurLoop = nullptr;
+  DominatorTree &DT;
+  LoopInfo &LI;
+  TargetLibraryInfo &TLI;
+  const TargetTransformInfo &TTI;
+  const DataLayout &DL;
+
+public:
+  explicit RISCVLoopIdiomRecognize(DominatorTree &DT, LoopInfo &LI,
+                                   TargetLibraryInfo &TLI,
+                                   const TargetTransformInfo &TTI,
+                                   const DataLayout &DL)
+      : DT(DT), LI(LI), TLI(TLI), TTI(TTI), DL(DL) {}
+
+  /// Tries to recognize and transform an idiom in \p L.
+  /// \returns true if the IR was changed.
+  bool run(Loop *L);
+
+private:
+  /// \name Byte Compare Idiom Handling
+  /// @{
+
+  /// Matches the byte-compare loop shape on CurLoop and, on success, calls
+  /// transformByteCompare(). \returns true if the loop was transformed.
+  bool recognizeAndTransformByteCompare();
+
+  /// Emits the vector and scalar mismatch search and returns the resulting
+  /// index value.
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+                            GetElementPtrInst *GEPB, Instruction *Index,
+                            Value *Start, Value *MaxLen);
+
+  /// Redirects control flow around the original loop to the code produced by
+  /// expandFindMismatch() and patches the PHIs in the exit blocks.
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+                            PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+                            Value *Start, bool IncIdx, BasicBlock *FoundBB,
+                            BasicBlock *EndBB);
+
+  /// @}
+};
+} // end anonymous namespace
+
+/// Returns the scalable i8 vector type used by the vector loop. The minimum
+/// element count is the number of bytes in one RVV block shifted left by the
+/// LMUL exponent taken from CustomLoopIdiomLMUL, with the exponent capped at 3.
+static VectorType *getBestVectorTypeForLoopIdiom(LLVMContext &Ctx) {
+  const unsigned Exponent = std::min(3U, CustomLoopIdiomLMUL.getValue());
+  const unsigned BytesPerBlock = RISCV::RVVBitsPerBlock / 8;
+  const unsigned MinNumElts = BytesPerBlock << Exponent;
+  return VectorType::get(Type::getInt8Ty(Ctx),
+                         ElementCount::getScalable(MinNumElts));
+}
+
+/// New pass manager entry point. Skips the loop when the pass is disabled,
+/// when the function is marked NoImplicitFloat, when pointers are not 64 bits
+/// wide, or when the target reports no scalable vector support. Otherwise it
+/// runs the idiom recognizer and, if the IR changed, reports only the
+/// dominator tree as preserved.
+PreservedAnalyses
+RISCVLoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
+                                 LoopStandardAnalysisResults &AR,
+                                 LPMUpdater &) {
+  if (DisableAll)
+    return PreservedAnalyses::all();
+
+  Function &F = *L.getHeader()->getParent();
+  if (F.hasFnAttribute(Attribute::NoImplicitFloat)) {
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE << " is disabled on " << F.getName()
+                      << " due to its NoImplicitFloat attribute");
+    return PreservedAnalyses::all();
+  }
+
+  // Bind by reference: the module owns the DataLayout and outlives this call,
+  // so there is no need to copy it.
+  const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+
+  // Only enabled on RV64 for now.
+  if (DL.getPointerSizeInBits() != 64)
+    return PreservedAnalyses::all();
+
+  // Only enabled when vector extension is present.
+  if (!AR.TTI.supportsScalableVectors())
+    return PreservedAnalyses::all();
+
+  RISCVLoopIdiomRecognize LIR(AR.DT, AR.LI, AR.TLI, AR.TTI, DL);
+  if (!LIR.run(&L))
+    return PreservedAnalyses::all();
+
+  auto PA = PreservedAnalyses::none();
+  PA.preserve<DominatorTreeAnalysis>();
+  return PA;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of RISCVLoopIdiomRecognize
+//
+//===----------------------------------------------------------------------===//
+
+/// Records \p L as the current loop and attempts the byte-compare transform.
+/// \returns true if the loop was rewritten.
+bool RISCVLoopIdiomRecognize::run(Loop *L) {
+  CurLoop = L;
+
+  // Nothing to do when the pass is switched off. A loop without a preheader
+  // could not be converted to canonical form (it must have an indirectbr in
+  // it), so give up on that as well.
+  if (DisableAll || !L->getLoopPreheader())
+    return false;
+
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
+                    << CurLoop->getHeader()->getParent()->getName()
+                    << "] Loop %" << CurLoop->getHeader()->getName() << "\n");
+
+  const bool Changed = recognizeAndTransformByteCompare();
+  return Changed;
+}
+
+/// Pattern-match adaptor: succeeds only when the value is invariant in loop
+/// \c L and the wrapped sub-pattern also matches it.
+template <typename SubPattern_t> struct match_LoopInvariant {
+  SubPattern_t SubPattern;
+  const Loop *L;
+
+  match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
+      : SubPattern(SP), L(L) {}
+
+  template <typename ITy> bool match(ITy *V) {
+    // Check invariance first so the sub-pattern is only evaluated (and only
+    // binds its captures) for loop-invariant values.
+    if (!L->isLoopInvariant(V))
+      return false;
+    return SubPattern.match(V);
+  }
+};
+
+/// Wraps sub-pattern \p M so that it only matches values that are invariant
+/// in loop \p L.
+template <typename Ty>
+inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
+  match_LoopInvariant<Ty> Matcher(M, L);
+  return Matcher;
+}
+
+/// Matches a two-block loop that increments a 32-bit index, compares it with
+/// a loop-invariant bound, and compares one byte loaded from each of two
+/// loop-invariant base pointers at that index. When the shape matches, the
+/// loop is bypassed via transformByteCompare().
+/// \returns true if the loop was transformed.
+bool RISCVLoopIdiomRecognize::recognizeAndTransformByteCompare() {
+  if (DisableByteCmp)
+    return false;
+
+  BasicBlock *PH = CurLoop->getLoopPreheader();
+
+  // The preheader should only contain an unconditional branch.
+  if (!PH || &PH->front() != PH->getTerminator())
+    return false;
+
+  using namespace PatternMatch;
+
+  BasicBlock *Header;
+  if (!match(PH->getTerminator(), m_UnconditionalBr(Header)))
+    return false;
+
+  if (Header != CurLoop->getHeader())
+    return false;
+
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2)
+    return false;
+
+  auto *PN = dyn_cast<PHINode>(&Header->front());
+  if (!PN || PN->getNumIncomingValues() != 2)
+    return false;
+
+  auto LoopBlocks = CurLoop->getBlocks();
+  // The first block in the loop should contain only 4 instructions, e.g.
+  //
+  //  while.cond:
+  //   %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ]
+  //   %inc = add i32 %res.phi, 1
+  //   %cmp.not = icmp eq i32 %inc, %n
+  //   br i1 %cmp.not, label %while.end, label %while.body
+  //
+  auto CondBBInsts = LoopBlocks[0]->instructionsWithoutDebug();
+  if (std::distance(CondBBInsts.begin(), CondBBInsts.end()) != 4)
+    return false;
+
+  // The second block should contain 7 instructions, e.g.
+  //
+  //  while.body:
+  //   %idx = zext i32 %inc to i64
+  //   %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx
+  //   %load.a = load i8, ptr %idx.a
+  //   %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx
+  //   %load.b = load i8, ptr %idx.b
+  //   %cmp.not.ld = icmp eq i8 %load.a, %load.b
+  //   br i1 %cmp.not.ld, label %while.cond, label %while.end
+  //
+  auto LoopBBInsts = LoopBlocks[1]->instructionsWithoutDebug();
+  if (std::distance(LoopBBInsts.begin(), LoopBBInsts.end()) != 7)
+    return false;
+
+  // The incoming value to the PHI node from the loop should be an add of 1.
+  Instruction *Index = nullptr;
+  Value *StartIdx = nullptr;
+  for (BasicBlock *BB : PN->blocks()) {
+    if (!CurLoop->contains(BB)) {
+      StartIdx = PN->getIncomingValueForBlock(BB);
+      continue;
+    }
+    Index = dyn_cast<Instruction>(PN->getIncomingValueForBlock(BB));
+    // Limit to 32-bit types for now
+    if (!Index || !Index->getType()->isIntegerTy(32) ||
+        !match(Index, m_c_Add(m_Specific(PN), m_One())))
+      return false;
+  }
+
+  // Apart from the PHI and the incremented index, no value defined in the
+  // loop may be used outside of it. Users of an instruction are always
+  // instructions, so use cast<> (which asserts) rather than an unchecked
+  // dyn_cast<> whose null result would be dereferenced by contains().
+  for (BasicBlock *BB : LoopBlocks)
+    for (Instruction &I : *BB)
+      if (&I != PN && &I != Index)
+        for (User *U : I.users())
+          if (!CurLoop->contains(cast<Instruction>(U)))
+            return false;
+
+  // Match the branch instruction for the header
+  ICmpInst::Predicate Pred;
+  Value *MaxLen;
+  BasicBlock *EndBB, *WhileBB;
+  if (!match(Header->getTerminator(),
+             m_Br(m_ICmp(Pred, m_Specific(Index), m_Value(MaxLen)),
+                  m_BasicBlock(EndBB), m_BasicBlock(WhileBB))))
+    return false;
+
+  // Make sure Pred is comparing for equal, and that the bound is available
+  // outside the loop: MaxLen is used by the code we emit in front of the
+  // loop, so a value defined inside the loop would not dominate its new uses.
+  if (Pred != ICmpInst::ICMP_EQ || !CurLoop->isLoopInvariant(MaxLen))
+    return false;
+
+  // Make sure EndBB is outside the loop and WhileBB is inside the loop.
+  if (CurLoop->contains(EndBB) || !CurLoop->contains(WhileBB))
+    return false;
+
+  // WhileBB should contain the pattern of load & compare instructions. Match
+  // the pattern and find the GEP instructions used by the loads.
+  ICmpInst::Predicate WhilePred;
+  BasicBlock *FoundBB;
+  BasicBlock *TrueBB;
+  Value *A, *B;
+  if (!match(WhileBB->getTerminator(),
+             m_Br(m_ICmp(WhilePred, m_Load(m_Value(A)), m_Load(m_Value(B))),
+                  m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))))
+    return false;
+
+  // Make sure WhilePred is comparing for equal
+  if (WhilePred != ICmpInst::ICMP_EQ)
+    return false;
+
+  // Make sure TrueBB is the loop header and FoundBB is outside the loop.
+  if (CurLoop->getHeader() != TrueBB || CurLoop->contains(FoundBB))
+    return false;
+
+  auto *GEPA = dyn_cast<GetElementPtrInst>(A);
+  auto *GEPB = dyn_cast<GetElementPtrInst>(B);
+  if (!GEPA || !GEPB)
+    return false;
+
+  Value *PtrA = GEPA->getPointerOperand();
+  Value *PtrB = GEPB->getPointerOperand();
+
+  // Check PtrA and PtrB stride at i8.
+  if (!CurLoop->isLoopInvariant(PtrA) || !CurLoop->isLoopInvariant(PtrB) ||
+      !GEPA->getResultElementType()->isIntegerTy(8) ||
+      !GEPB->getResultElementType()->isIntegerTy(8) || PtrA == PtrB)
+    return false;
+
+  // Check loads from GEPA and GEPB are i8. Volatile or atomic loads must not
+  // be widened or re-ordered, so only simple loads are accepted.
+  auto *LoadA = dyn_cast<LoadInst>(GEPA->getNextNode());
+  if (!LoadA || !LoadA->isSimple() || !LoadA->getType()->isIntegerTy(8))
+    return false;
+  auto *LoadB = dyn_cast<LoadInst>(GEPB->getNextNode());
+  if (!LoadB || !LoadB->isSimple() || !LoadB->getType()->isIntegerTy(8))
+    return false;
+
+  // Check that the index to the GEPs is the index we found earlier
+  if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1)
+    return false;
+
+  Value *IdxA = GEPA->getOperand(GEPA->getNumIndices());
+  Value *IdxB = GEPB->getOperand(GEPB->getNumIndices());
+
+  if (IdxA != IdxB || !match(IdxA, m_ZExt(m_Specific(Index))))
+    return false;
+
+  // We only ever expect the pre-incremented index value to be used inside the
+  // loop.
+  if (!PN->hasOneUse())
+    return false;
+
+  // Ensure that when the Found and End blocks are identical the PHIs have the
+  // supported format. We don't currently allow cases like this:
+  //   while.cond:
+  //     ...
+  //     br i1 %cmp.not, label %while.end, label %while.body
+  //
+  //   while.body:
+  //     ...
+  //     br i1 %cmp.not2, label %while.cond, label %while.end
+  //
+  //   while.end:
+  //     %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ]
+  //
+  // Where the incoming values for %final_ptr are unique and from each of the
+  // loop blocks, but not actually defined in the loop. This requires extra
+  // work setting up the byte.compare block, i.e. by introducing a select to
+  // choose the correct value.
+  // TODO: We could add support for this in future.
+  if (FoundBB == EndBB) {
+    for (PHINode &EndPN : EndBB->phis()) {
+      Value *WhileCondVal = EndPN.getIncomingValueForBlock(Header);
+      Value *WhileBodyVal = EndPN.getIncomingValueForBlock(WhileBB);
+
+      // The value of the index when leaving the while.cond block is always the
+      // same as the end value (MaxLen) so we permit either. When leaving the
+      // while.body block the index is the mismatch position, which is in
+      // general different from MaxLen, so only the index itself is permitted
+      // there. Any other pair of values must be identical for both blocks.
+      if (WhileCondVal != WhileBodyVal &&
+          ((WhileCondVal != Index && WhileCondVal != MaxLen) ||
+           WhileBodyVal != Index))
+        return false;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n"
+                    << *(EndBB->getParent()) << "\n\n");
+  transformByteCompare(GEPA, GEPB, PN, MaxLen, Index, StartIdx, true, FoundBB,
+                       EndBB);
+  LLVM_DEBUG(dbgs() << "AFTER IDIOM TRANSFORMATION: \n"
+                    << *(EndBB->getParent()) << "\n\n");
+  return true;
+}
+
+/// Splits the preheader and emits, between its two halves, a vector-predicated
+/// loop plus a scalar fallback loop that search [Start, MaxLen) for the first
+/// index at which the bytes addressed through \p GEPA and \p GEPB differ.
+///
+/// \param Builder IRBuilder used for all emitted instructions; on return its
+///                insertion point is in the block that receives the result.
+/// \param GEPA,GEPB GEPs of the original loop; their base pointers and
+///                inbounds flags are reused.
+/// \param Index   The original incremented index; its nuw/nsw flags are
+///                copied to the scalar loop increment.
+/// \param Start   First index (i32) to compare.
+/// \param MaxLen  Loop bound (i32).
+/// \returns an i32 value that is the mismatch index when a mismatch is found
+///          and MaxLen when either loop runs to completion.
+Value *RISCVLoopIdiomRecognize::expandFindMismatch(
+    IRBuilder<> &Builder, GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+    Instruction *Index, Value *Start, Value *MaxLen) {
+  Value *PtrA = GEPA->getPointerOperand();
+  Value *PtrB = GEPB->getPointerOperand();
+
+  // Get the arguments and types for the intrinsic.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  auto *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+  LLVMContext &Ctx = PHBranch->getContext();
+  Type *LoadType = Type::getInt8Ty(Ctx);
+  Type *ResType = Builder.getInt32Ty();
+
+  // Split block at the original callsite, where the EndBlock continues from
+  // where the original call ended.
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+  BasicBlock *EndBlock =
+      SplitBlock(Preheader, PHBranch, &DT, &LI, nullptr, "mismatch_end");
+
+  // Safeguard to check if we build the correct DomTree with DTU.
+  auto CheckDTU = llvm::make_scope_exit([&]() {
+    assert(DTU.getDomTree().verify() && "Ill-formed DomTree built by DTU");
+  });
+
+  // Create the blocks that we're going to need:
+  //  1. A block for checking the zero-extended length exceeds 0
+  //  2. A block to check that the start and end addresses of a given array
+  //     lie on the same page.
+  //  3. The RVV loop preheader i.e. vector_loop_preheader
+  //  4. The first RVV loop block i.e. vector_loop
+  //  5. The RVV loop increment block i.e. vector_loop_inc
+  //  6. A block we can jump to from the RVV loop when a mismatch is found i.e.
+  //     vector_loop_found
+  //  7. The first block of the scalar loop itself, containing PHIs, loads
+  //     and cmp.
+  //  8. A scalar loop increment block to increment the PHIs and go back
+  //     around the loop.
+
+  BasicBlock *MinItCheckBlock = BasicBlock::Create(
+      Ctx, "mismatch_min_it_check", EndBlock->getParent(), EndBlock);
+
+  // This DTU update is actually the only one we need to cover all control flow
+  // changes made in this function. Because the current DTU algorithm
+  // recalculates the whole sub-tree between a deleted edge. And the edge
+  // between Preheader and EndBlock happens to enclose all the blocks we
+  // inserted in this function.
+  DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock},
+                    {DominatorTree::Delete, Preheader, EndBlock}});
+
+  // Update the terminator added by SplitBlock to branch to the first block
+  Preheader->getTerminator()->setSuccessor(0, MinItCheckBlock);
+
+  BasicBlock *MemCheckBlock = BasicBlock::Create(
+      Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock);
+
+  BasicBlock *RVVLoopPreheaderBlock = BasicBlock::Create(
+      Ctx, "mismatch_vector_loop_preheader", EndBlock->getParent(), EndBlock);
+
+  BasicBlock *RVVLoopStartBlock = BasicBlock::Create(
+      Ctx, "mismatch_vector_loop", EndBlock->getParent(), EndBlock);
+
+  BasicBlock *RVVLoopIncBlock = BasicBlock::Create(
+      Ctx, "mismatch_vector_loop_inc", EndBlock->getParent(), EndBlock);
+
+  BasicBlock *RVVLoopMismatchBlock = BasicBlock::Create(
+      Ctx, "mismatch_vector_loop_found", EndBlock->getParent(), EndBlock);
+
+  BasicBlock *LoopPreHeaderBlock = BasicBlock::Create(
+      Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock);
+
+  BasicBlock *LoopStartBlock =
+      BasicBlock::Create(Ctx, "mismatch_loop", EndBlock->getParent(), EndBlock);
+
+  BasicBlock *LoopIncBlock = BasicBlock::Create(
+      Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock);
+
+  // Update LoopInfo with the new RVV & scalar loops.
+  auto RVVLoop = LI.AllocateLoop();
+  auto ScalarLoop = LI.AllocateLoop();
+  if (CurLoop->getParentLoop()) {
+    CurLoop->getParentLoop()->addChildLoop(RVVLoop);
+    CurLoop->getParentLoop()->addChildLoop(ScalarLoop);
+
+    CurLoop->getParentLoop()->addBasicBlockToLoop(MinItCheckBlock, LI);
+    CurLoop->getParentLoop()->addBasicBlockToLoop(MemCheckBlock, LI);
+    CurLoop->getParentLoop()->addBasicBlockToLoop(RVVLoopPreheaderBlock, LI);
+    CurLoop->getParentLoop()->addBasicBlockToLoop(RVVLoopMismatchBlock, LI);
+    CurLoop->getParentLoop()->addBasicBlockToLoop(LoopPreHeaderBlock, LI);
+  } else {
+    LI.addTopLevelLoop(RVVLoop);
+    LI.addTopLevelLoop(ScalarLoop);
+  }
+
+  // Add the new basic blocks to their associated loops.
+  RVVLoop->addBasicBlockToLoop(RVVLoopStartBlock, LI);
+  RVVLoop->addBasicBlockToLoop(RVVLoopIncBlock, LI);
+
+  ScalarLoop->addBasicBlockToLoop(LoopStartBlock, LI);
+  ScalarLoop->addBasicBlockToLoop(LoopIncBlock, LI);
+
+  // Set up some types and constants that we intend to reuse.
+  Type *I64Type = Builder.getInt64Ty();
+  Type *I32Type = Builder.getInt32Ty();
+
+  // Check the zero-extended iteration count > 0
+  Builder.SetInsertPoint(MinItCheckBlock);
+  Value *ExtStart = Builder.CreateZExt(Start, I64Type);
+  Value *ExtEnd = Builder.CreateZExt(MaxLen, I64Type);
+  // This check doesn't really cost us very much.
+
+  Value *LimitCheck = Builder.CreateICmpULE(Start, MaxLen);
+  BranchInst *MinItCheckBr =
+      BranchInst::Create(MemCheckBlock, LoopPreHeaderBlock, LimitCheck);
+  MinItCheckBr->setMetadata(
+      LLVMContext::MD_prof,
+      MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1));
+  Builder.Insert(MinItCheckBr);
+
+  // For each of the arrays, check the start/end addresses are on the same
+  // page.
+  Builder.SetInsertPoint(MemCheckBlock);
+
+  // For each start and end address compute the page number, using the min
+  // architecturally allowed page size (4096, hence the shift by 12). If the
+  // start and end of either array are on different pages, fall back to the
+  // scalar loop.
+  Value *LhsStartPage = Builder.CreateLShr(
+      Builder.CreatePtrToInt(Builder.CreateGEP(LoadType, PtrA, ExtStart),
+                             I64Type),
+      uint64_t(12));
+  Value *LhsEndPage = Builder.CreateLShr(
+      Builder.CreatePtrToInt(Builder.CreateGEP(LoadType, PtrA, ExtEnd),
+                             I64Type),
+      uint64_t(12));
+  Value *RhsStartPage = Builder.CreateLShr(
+      Builder.CreatePtrToInt(Builder.CreateGEP(LoadType, PtrB, ExtStart),
+                             I64Type),
+      uint64_t(12));
+  Value *RhsEndPage = Builder.CreateLShr(
+      Builder.CreatePtrToInt(Builder.CreateGEP(LoadType, PtrB, ExtEnd),
+                             I64Type),
+      uint64_t(12));
+  Value *LhsPageCmp = Builder.CreateICmpNE(LhsStartPage, LhsEndPage);
+  Value *RhsPageCmp = Builder.CreateICmpNE(RhsStartPage, RhsEndPage);
+
+  BranchInst *CombinedPageCmpCmpBr =
+      BranchInst::Create(LoopPreHeaderBlock, RVVLoopPreheaderBlock,
+                         Builder.CreateOr(LhsPageCmp, RhsPageCmp));
+  CombinedPageCmpCmpBr->setMetadata(
+      LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext())
+                                .createBranchWeights(10, 90));
+  Builder.Insert(CombinedPageCmpCmpBr);
+
+  // Set up the RVV loop preheader. It currently only branches to the vector
+  // loop.
+  Builder.SetInsertPoint(RVVLoopPreheaderBlock);
+
+  // At this point we know two things must be true:
+  //  1. Start <= End
+  //  2. ExtMaxLen <= 4096 due to the page checks.
+  // Therefore, we know that we can use a 64-bit induction variable that
+  // starts from 0 -> ExtMaxLen and it will not overflow.
+  auto *JumpToRVVLoop = BranchInst::Create(RVVLoopStartBlock);
+  Builder.Insert(JumpToRVVLoop);
+
+  // Set up the first RVV loop block by creating the PHIs, doing the vector
+  // loads and comparing the vectors.
+  Builder.SetInsertPoint(RVVLoopStartBlock);
+  auto *RVVIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vector_index");
+  RVVIndexPhi->addIncoming(ExtStart, RVVLoopPreheaderBlock);
+
+  // Calculate AVL by subtracting the vector loop index from the trip count
+  Value *AVL = Builder.CreateSub(ExtEnd, RVVIndexPhi, "avl", /*HasNUW=*/true,
+                                 /*HasNSW=*/true);
+
+  VectorType *RVVLoadType = getBestVectorTypeForLoopIdiom(Builder.getContext());
+  auto *VF = ConstantInt::get(
+      I32Type, RVVLoadType->getElementCount().getKnownMinValue());
+  auto *IsScalable = ConstantInt::getBool(
+      Builder.getContext(), RVVLoadType->getElementCount().isScalable());
+
+  Value *RVL =
+      Builder.CreateIntrinsic(Intrinsic::experimental_get_vector_length,
+                              {I64Type}, {AVL, VF, IsScalable});
+  Value *GepOffset = RVVIndexPhi;
+
+  Value *RVVLhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset);
+  if (GEPA->isInBounds())
+    cast<GetElementPtrInst>(RVVLhsGep)->setIsInBounds(true);
+  VectorType *TrueMaskTy =
+      VectorType::get(Builder.getInt1Ty(), RVVLoadType->getElementCount());
+  Value *AllTrueMask = Constant::getAllOnesValue(TrueMaskTy);
+  Value *RVVLhsLoad = Builder.CreateIntrinsic(
+      Intrinsic::vp_load, {RVVLoadType, RVVLhsGep->getType()},
+      {RVVLhsGep, AllTrueMask, RVL}, nullptr, "lhs.load");
+
+  Value *RVVRhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset);
+  if (GEPB->isInBounds())
+    cast<GetElementPtrInst>(RVVRhsGep)->setIsInBounds(true);
+  // Overload the intrinsic on the RHS pointer's own type; PtrA and PtrB are
+  // not required to live in the same address space.
+  Value *RVVRhsLoad = Builder.CreateIntrinsic(
+      Intrinsic::vp_load, {RVVLoadType, RVVRhsGep->getType()},
+      {RVVRhsGep, AllTrueMask, RVL}, nullptr, "rhs.load");
+
+  StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE);
+  auto *PredicateMDS = MDString::get(RVVLhsLoad->getContext(), PredicateStr);
+  Value *Pred = MetadataAsValue::get(RVVLhsLoad->getContext(), PredicateMDS);
+  Value *RVVMatchCmp =
+      Builder.CreateIntrinsic(Intrinsic::vp_icmp, {RVVLhsLoad->getType()},
+                              {RVVLhsLoad, RVVRhsLoad, Pred, AllTrueMask, RVL},
+                              nullptr, "mismatch.cmp");
+  Value *CTZ = Builder.CreateIntrinsic(
+      Intrinsic::vp_cttz_elts, {ResType, RVVMatchCmp->getType()},
+      {RVVMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true), AllTrueMask, RVL});
+  // RISC-V refines/lowers the poison returned by cttz.elts to -1.
+  Value *MismatchFound =
+      Builder.CreateICmpSGE(CTZ, ConstantInt::get(ResType, 0));
+  auto *RVVEarlyExit =
+      BranchInst::Create(RVVLoopMismatchBlock, RVVLoopIncBlock, MismatchFound);
+  Builder.Insert(RVVEarlyExit);
+
+  // Advance the index by the number of elements processed in this iteration
+  // and branch back to the start of the loop unless the end was reached.
+  Builder.SetInsertPoint(RVVLoopIncBlock);
+  Value *RVL64 = Builder.CreateZExt(RVL, I64Type);
+  Value *NewRVVIndexPhi = Builder.CreateAdd(RVVIndexPhi, RVL64, "",
+                                            /*HasNUW=*/true, /*HasNSW=*/true);
+  RVVIndexPhi->addIncoming(NewRVVIndexPhi, RVVLoopIncBlock);
+  Value *ExitCond = Builder.CreateICmpNE(NewRVVIndexPhi, ExtEnd);
+  auto *RVVLoopBranchBack =
+      BranchInst::Create(RVVLoopStartBlock, EndBlock, ExitCond);
+  Builder.Insert(RVVLoopBranchBack);
+
+  // If we found a mismatch then we need to calculate which lane in the vector
+  // had a mismatch and add that on to the current loop index.
+  Builder.SetInsertPoint(RVVLoopMismatchBlock);
+
+  // Add LCSSA phis for CTZ and RVVIndexPhi.
+  auto *CTZLCSSAPhi = Builder.CreatePHI(CTZ->getType(), 1, "ctz");
+  CTZLCSSAPhi->addIncoming(CTZ, RVVLoopStartBlock);
+  auto *RVVIndexLCSSAPhi =
+      Builder.CreatePHI(RVVIndexPhi->getType(), 1, "mismatch_vector_index");
+  RVVIndexLCSSAPhi->addIncoming(RVVIndexPhi, RVVLoopStartBlock);
+
+  Value *CTZI64 = Builder.CreateZExt(CTZLCSSAPhi, I64Type);
+  Value *RVVLoopRes64 = Builder.CreateAdd(RVVIndexLCSSAPhi, CTZI64, "",
+                                          /*HasNUW=*/true, /*HasNSW=*/true);
+  Value *RVVLoopRes = Builder.CreateTrunc(RVVLoopRes64, ResType);
+
+  Builder.Insert(BranchInst::Create(EndBlock));
+
+  // Generate code for scalar loop.
+  Builder.SetInsertPoint(LoopPreHeaderBlock);
+  auto *StartIndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_start_index");
+  StartIndexPhi->addIncoming(Start, MemCheckBlock);
+  StartIndexPhi->addIncoming(Start, MinItCheckBlock);
+  Builder.Insert(BranchInst::Create(LoopStartBlock));
+
+  Builder.SetInsertPoint(LoopStartBlock);
+  auto *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index");
+  IndexPhi->addIncoming(StartIndexPhi, LoopPreHeaderBlock);
+
+  // Load bytes from each array and compare them.
+  GepOffset = Builder.CreateZExt(IndexPhi, I64Type);
+
+  Value *LhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset);
+  if (GEPA->isInBounds())
+    cast<GetElementPtrInst>(LhsGep)->setIsInBounds(true);
+  Value *LhsLoad = Builder.CreateLoad(LoadType, LhsGep);
+
+  Value *RhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset);
+  if (GEPB->isInBounds())
+    cast<GetElementPtrInst>(RhsGep)->setIsInBounds(true);
+  Value *RhsLoad = Builder.CreateLoad(LoadType, RhsGep);
+
+  Value *MatchCmp = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
+  // If we have a mismatch then exit the loop ...
+  auto *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp);
+  Builder.Insert(MatchCmpBr);
+
+  // Have we reached the maximum permitted length for the loop? The test must
+  // use the incremented index: comparing the pre-increment value would run one
+  // extra iteration and load the bytes at offset MaxLen, which the original
+  // loop never accesses.
+  Builder.SetInsertPoint(LoopIncBlock);
+  Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "",
+                                    /*HasNUW=*/Index->hasNoUnsignedWrap(),
+                                    /*HasNSW=*/Index->hasNoSignedWrap());
+  IndexPhi->addIncoming(PhiInc, LoopIncBlock);
+  Value *IVCmp = Builder.CreateICmpEQ(PhiInc, MaxLen);
+  auto *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp);
+  Builder.Insert(IVCmpBr);
+
+  // In the end block we need to insert a PHI node to deal with four incoming
+  // edges:
+  //  1. We didn't find a mismatch in the scalar loop, so we should return
+  //     MaxLen.
+  //  2. We exited the scalar loop early due to a mismatch and need to return
+  //     the index that we found.
+  //  3. We didn't find a mismatch in the RVV loop, so we should return
+  //     MaxLen.
+  //  4. We exited the RVV loop early due to a mismatch and need to return
+  //     the index that we found.
+  Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt());
+  auto *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result");
+  ResPhi->addIncoming(MaxLen, LoopIncBlock);
+  ResPhi->addIncoming(IndexPhi, LoopStartBlock);
+  ResPhi->addIncoming(MaxLen, RVVLoopIncBlock);
+  ResPhi->addIncoming(RVVLoopRes, RVVLoopMismatchBlock);
+
+  // The PHI already has the result type, so no cast is needed.
+  return ResPhi;
+}
+
+/// Emits the mismatch search in front of the loop, replaces all uses of the
+/// loop's index with its result, and adds a "byte.compare" block that branches
+/// to \p EndBB when the result equals \p MaxLen and to \p FoundBB otherwise.
+/// The original loop is kept but is only reachable through the false edge of
+/// an always-true conditional branch.
+///
+/// \param GEPA,GEPB GEPs feeding the two compared loads.
+/// \param IndPhi  Index PHI of the original loop; expected to have one use.
+/// \param MaxLen  Loop bound.
+/// \param Index   Incremented index whose uses are replaced.
+/// \param Start   Initial index value coming from outside the loop.
+/// \param IncIdx  If true, the search starts at Start + 1 because the original
+///                loop increments before loading.
+/// \param FoundBB Exit block taken on a mismatch.
+/// \param EndBB   Exit block taken when the bound is reached.
+void RISCVLoopIdiomRecognize::transformByteCompare(
+    GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, PHINode *IndPhi,
+    Value *MaxLen, Instruction *Index, Value *Start, bool IncIdx,
+    BasicBlock *FoundBB, BasicBlock *EndBB) {
+
+  // Insert the byte compare code at the end of the preheader block
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  BasicBlock *Header = CurLoop->getHeader();
+  auto *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+  IRBuilder<> Builder(PHBranch);
+  Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
+
+  // Increment the pointer if this was done before the loads in the loop.
+  if (IncIdx)
+    Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1));
+
+  Value *ByteCmpRes =
+      expandFindMismatch(Builder, GEPA, GEPB, Index, Start, MaxLen);
+
+  // Replaces uses of index with the computed mismatch result.
+  assert(IndPhi->hasOneUse() && "Index phi node has more than one use!");
+  Index->replaceAllUsesWith(ByteCmpRes);
+
+  // If no mismatch was found, we can jump to the end block. Create a
+  // new basic block for the compare instruction.
+  auto *CmpBB = BasicBlock::Create(Preheader->getContext(), "byte.compare",
+                                   Preheader->getParent());
+  CmpBB->moveBefore(EndBB);
+
+  // Replace the branch in the preheader with an always-true conditional branch.
+  // This ensures there is still a reference to the original loop.
+  Value *BrCnd = Builder.CreateICmpEQ(ConstantInt::get(Start->getType(), 1),
+                                      ConstantInt::get(Start->getType(), 1));
+  Builder.CreateCondBr(BrCnd, CmpBB, Header);
+  PHBranch->eraseFromParent();
+
+  // expandFindMismatch() split the preheader and left the builder in the block
+  // that now holds the branch to CmpBB; remember it for the DomTree update.
+  BasicBlock *BranchBB = Builder.GetInsertBlock();
+
+  // Create the branch to either the end or found block depending on the value
+  // returned by the mismatch search.
+  Builder.SetInsertPoint(CmpBB);
+  Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen);
+  Builder.CreateCondBr(FoundCmp, EndBB, FoundBB);
+
+  auto FixSuccessorPhis = [&](BasicBlock *SuccBB) {
+    for (PHINode &PN : SuccBB->phis()) {
+      // At this point we've already replaced all uses of the result from the
+      // loop with ByteCmp. Look through the incoming values to find ByteCmp,
+      // meaning this is a Phi collecting the results of the byte compare.
+      bool ResPhi = any_of(PN.incoming_values(), [ByteCmpRes](Value *Op) {
+        return Op == ByteCmpRes;
+      });
+
+      // If any of the incoming values were ByteCmp, we need to also add
+      // it as an incoming value from CmpBB.
+      if (ResPhi) {
+        PN.addIncoming(ByteCmpRes, CmpBB);
+      } else {
+        // Otherwise, this is a Phi for different values. We should create
+        // a new incoming value from CmpBB matching the same value as from
+        // the old loop.
+        for (BasicBlock *BB : PN.blocks())
+          if (CurLoop->contains(BB)) {
+            PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB);
+            break;
+          }
+      }
+    }
+  };
+
+  // Ensure all Phis in the successors of CmpBB have an incoming value from it.
+  FixSuccessorPhis(EndBB);
+  FixSuccessorPhis(FoundBB);
+
+  // The new CmpBB block isn't part of the loop, but will need to be added to
+  // the outer loop if there is one.
+  if (!CurLoop->isOutermost())
+    CurLoop->getParentLoop()->addBasicBlockToLoop(CmpBB, LI);
+
+  // Update the dominator tree. CmpBB's only predecessor is BranchBB, not the
+  // original preheader, and EndBB/FoundBB have gained CmpBB as a predecessor.
+  // Inserting the edge into the not-yet-known CmpBB lets the DomTree discover
+  // the block together with its outgoing edges.
+  DT.insertEdge(BranchBB, CmpBB);
+}
diff --git a/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.h b/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.h
new file mode 100644
index 0000000000000..7906936b934b9
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.h
@@ -0,0 +1,25 @@
+//===-------- RISCVLoopIdiomRecognize.h -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVLOOPIDIOMRECOGNIZE_H
+#define LLVM_LIB_TARGET_RISCV_RISCVLOOPIDIOMRECOGNIZE_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+
+namespace llvm {
+
+/// New pass manager loop pass wrapper for the RISC-V loop idiom recognizer.
+class RISCVLoopIdiomRecognizePass
+    : public PassInfoMixin<RISCVLoopIdiomRecognizePass> {
+public:
+  RISCVLoopIdiomRecognizePass() = default;
+
+  /// Runs the recognizer on loop \p L and reports which analyses survive.
+  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+                        LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_RISCV_RISCVLOOPIDIOMRECOGNIZE_H
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 5d598a275a008..3c06e62093bb7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -13,6 +13,7 @@
#include "RISCVTargetMachine.h"
#include "MCTargetDesc/RISCVBaseInfo.h"
#include "RISCV.h"
+#include "RISCVLoopIdiomRecognize.h"
#include "RISCVMachineFunctionInfo.h"
#include "RISCVTargetObjectFile.h"
#include "RISCVTargetTransformInfo.h"
@@ -33,6 +34,7 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO.h"
@@ -584,3 +586,21 @@ bool RISCVTargetMachine::parseMachineFunctionInfo(
PFS.MF.getInfo<RISCVMachineFunctionInfo>()->initializeBaseYamlFields(YamlMFI);
return false;
}
+
+/// Hooks RISCVLoopIdiomRecognizePass into the new pass manager via \p PB:
+/// makes it nameable in textual loop pipelines and adds it at the late loop
+/// optimization extension point.
+///
+/// NOTE(review): \p PopulateClassToPassNames is not consulted here, so no
+/// class-name to pass-name mapping is registered for the pass.
+void RISCVTargetMachine::registerPassBuilderCallbacks(
+    PassBuilder &PB, bool PopulateClassToPassNames) {
+  // Let `-passes=` loop pipelines refer to the pass as "riscv-loop-idiom".
+  PB.registerPipelineParsingCallback(
+      [](StringRef PassName, LoopPassManager &PM,
+         ArrayRef<PassBuilder::PipelineElement>) {
+        // Returning false leaves unrecognized names to other parsers.
+        if (PassName != "riscv-loop-idiom")
+          return false;
+        PM.addPass(RISCVLoopIdiomRecognizePass());
+        return true;
+      });
+
+  // The callback needs no state and does not look at the optimization level,
+  // so it captures nothing and leaves the level parameter unnamed.
+  PB.registerLateLoopOptimizationsEPCallback(
+      [](LoopPassManager &LPM, OptimizationLevel) {
+        LPM.addPass(RISCVLoopIdiomRecognizePass());
+      });
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
index 68dfb3c81f2fe..1f8ccc76987c7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
@@ -59,6 +59,9 @@ class RISCVTargetMachine : public LLVMTargetMachine {
PerFunctionMIParsingState &PFS,
SMDiagnostic &Error,
SMRange &SourceRange) const override;
+ /// Registers the RISC-V loop idiom pass and its pipeline name with \p PB.
+ void registerPassBuilderCallbacks(PassBuilder &PB,
+ bool PopulateClassToPassNames) override;
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
new file mode 100644
index 0000000000000..047ed61119111
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
@@ -0,0 +1,1771 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -riscv-disable-all-loop-idiom=false -passes=riscv-loop-idiom -mtriple=riscv64-unknown-linux-gnu -mattr=+v -S < %s | FileCheck %s
+; RUN: opt -riscv-disable-all-loop-idiom=false -passes=riscv-loop-idiom -mtriple=riscv64-unknown-linux-gnu -riscv-loop-idiom-lmul=3 -mattr=+v -S < %s | FileCheck %s --check-prefix=LMUL8
+; RUN: opt -riscv-disable-all-loop-idiom=false -passes='loop(riscv-loop-idiom),simplifycfg' -mtriple=riscv64-unknown-linux-gnu -mattr=+v -S < %s | FileCheck %s --check-prefix=LOOP-DEL
+
+define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: mismatch_vector_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vector_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vector_loop_inc:
+; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vector_loop_found:
+; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ]
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: byte.compare:
+; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12
+; LMUL8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64
+; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; LMUL8: mismatch_vector_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vector_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vector_loop_inc:
+; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vector_loop_found:
+; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ]
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1:![0-9]+]]
+; LOOP-DEL: mismatch_vector_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL: mismatch_vector_loop_inc:
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL: mismatch_vector_loop_found:
+; LOOP-DEL-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT: br label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]]
+;
+entry:
+ br label %while.cond
+; Loop header: bump the index first (%inc = %len.addr + 1), exit once it equals %n.
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+; Loop body: load a[%inc] and b[%inc]; keep iterating only while the bytes are equal.
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+; Both loop exits merge here; the returned value is the index at which the loop stopped.
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+ ret i32 %inc.lcssa
+}
+
+define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_signed_wrap(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vector_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vector_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vector_loop_inc:
+; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vector_loop_found:
+; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ]
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: byte.compare:
+; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_signed_wrap(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12
+; LMUL8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64
+; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8: mismatch_vector_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vector_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vector_loop_inc:
+; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vector_loop_found:
+; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ]
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_signed_wrap(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_vector_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL: mismatch_vector_loop_inc:
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL: mismatch_vector_loop_found:
+; LOOP-DEL-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT: br label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_signed_wrap(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add nsw i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; NO-TRANSFORM: while.end:
+; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]]
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add nsw i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+ ret i32 %inc.lcssa
+}
+
+
+define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vector_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vector_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vector_loop_inc:
+; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vector_loop_found:
+; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ]
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX3]], [[N]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; CHECK: while.found:
+; CHECK-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: br label [[END:%.*]]
+; CHECK: byte.compare:
+; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]]
+; CHECK: while.end:
+; CHECK-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; CHECK-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; CHECK-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; CHECK-NEXT: ret i32 [[MISMATCH_INDEX]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12
+; LMUL8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64
+; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8: mismatch_vector_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vector_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vector_loop_inc:
+; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vector_loop_found:
+; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ]
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX3]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; LMUL8: while.found:
+; LMUL8-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: br label [[END:%.*]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: br label [[END]]
+; LMUL8: end:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; LMUL8-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; LMUL8-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; LMUL8-NEXT: ret i32 [[MISMATCH_INDEX]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_vector_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL: mismatch_vector_loop_inc:
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[BYTE_COMPARE:%.*]]
+; LOOP-DEL: mismatch_vector_loop_found:
+; LOOP-DEL-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT: br label [[BYTE_COMPARE]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[BYTE_COMPARE]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX3]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[BYTE_COMPARE]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: byte.compare:
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: [[TMP37:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LOOP-DEL-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP37]], i32 [[N]], i32 [[MISMATCH_RESULT]]
+; LOOP-DEL-NEXT: [[SPEC_SELECT4:%.*]] = select i1 [[TMP37]], ptr [[D]], ptr [[C]]
+; LOOP-DEL-NEXT: store i32 [[SPEC_SELECT]], ptr [[SPEC_SELECT4]], align 4
+; LOOP-DEL-NEXT: ret i32 [[SPEC_SELECT]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; NO-TRANSFORM: while.found:
+; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT: br label [[END:%.*]]
+; NO-TRANSFORM: while.end:
+; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT: br label [[END]]
+; NO-TRANSFORM: end:
+; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; NO-TRANSFORM-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; NO-TRANSFORM-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; NO-TRANSFORM-NEXT: ret i32 [[MISMATCH_INDEX]]
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.found
+
+while.found:
+ %mismatch_index1 = phi i32 [ %inc, %while.body ]
+ %found_ptr = phi ptr [ %c, %while.body ]
+ br label %end
+
+while.end:
+ %mismatch_index2 = phi i32 [ %n, %while.cond ]
+ %end_ptr = phi ptr [ %d, %while.cond ]
+ br label %end
+
+end:
+ %mismatch_index = phi i32 [ %mismatch_index1, %while.found ], [ %mismatch_index2, %while.end ]
+ %store_ptr = phi ptr [ %end_ptr, %while.end ], [ %found_ptr, %while.found ]
+ store i32 %mismatch_index, ptr %store_ptr
+ ret i32 %mismatch_index
+}
+
+
+
+; The byte-compare loop is reached only through %ph, i.e. when the entry guard
+; `icmp ult i32 %n, %x` holds; otherwise %entry branches directly to %while.end,
+; whose phi then yields %x. The CHECK, LMUL8 and LOOP-DEL expectations contain
+; the mismatch_* blocks built below %ph and keep the incoming %x from %entry in
+; %while.end; the NO-TRANSFORM expectations keep the original while.cond loop.
+;
+; NOTE(review): in the expected output the scalar fallback's exit test compares
+; the pre-increment index with N (TMP36 uses MISMATCH_INDEX, not TMP35), whereas
+; the source loop below tests the incremented %inc before loading. Confirm this
+; is the intended output of the pass rather than an off-by-one.
+; NOTE(review): the vector loop calls vp.cttz.elts with `i1 true` as its second
+; operand (presumably is_zero_poison) and then branches on `icmp sge FIRST, 0`.
+; Check this against the LangRef result of vp.cttz.elts for an all-false mask.
+define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
+; CHECK-LABEL: define i32 @compare_bytes_extra_cmp(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; CHECK-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; CHECK: ph:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vector_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vector_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vector_loop_inc:
+; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vector_loop_found:
+; CHECK-NEXT: [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ]
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
+; CHECK: byte.compare:
+; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP39]], label [[WHILE_END_LOOPEXIT]], label [[WHILE_END_LOOPEXIT]]
+; CHECK: while.end.loopexit:
+; CHECK-NEXT: [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: br label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_extra_cmp(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; LMUL8-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; LMUL8: ph:
+; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12
+; LMUL8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64
+; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8: mismatch_vector_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vector_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vector_loop_inc:
+; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vector_loop_found:
+; LMUL8-NEXT: [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ]
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP39]], label [[WHILE_END_LOOPEXIT]], label [[WHILE_END_LOOPEXIT]]
+; LMUL8: while.end.loopexit:
+; LMUL8-NEXT: [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: br label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_extra_cmp(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; LOOP-DEL-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; LOOP-DEL: ph:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_vector_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL: mismatch_vector_loop_inc:
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_vector_loop_found:
+; LOOP-DEL-NEXT: [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT: br label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[PH]] ]
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_extra_cmp(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) {
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; NO-TRANSFORM: ph:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[PH]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; NO-TRANSFORM: while.end:
+; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ], [ [[X]], [[ENTRY:%.*]] ]
+; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]]
+entry:
+ ; Extra guard in front of the loop: only enter %ph when %n < %x (unsigned).
+ %cmp.x = icmp ult i32 %n, %x
+ br i1 %cmp.x, label %ph, label %while.end
+
+ph:
+ br label %while.cond
+
+while.cond:
+ ; %inc is the index examined this iteration; reaching %n ends the search.
+ %len.addr = phi i32 [ %len, %ph ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ ; Load a[%inc] and b[%inc]; equal bytes continue the loop, a mismatch exits.
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ ; %inc arrives from either loop exit, %x from the guarded-off %entry edge.
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ], [ %x, %entry ]
+ ret i32 %inc.lcssa
+}
+
+define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) { ; byte-compare loop with a constant start index, a constant bound and two separate exit blocks
+; CHECK-LABEL: define void @compare_bytes_cleanup_block(
+; CHECK-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 12
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP4]], 12
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 12
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vector_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vector_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 0, [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP16]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vector_loop_inc:
+; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
+; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0
+; CHECK-NEXT: br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vector_loop_found:
+; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_MEM_CHECK]] ], [ 1, [[MISMATCH_MIN_IT_CHECK]] ]
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]]
+; CHECK-NEXT: br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], 0
+; CHECK-NEXT: br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; CHECK: byte.compare:
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; CHECK-NEXT: br i1 [[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]]
+; CHECK: cleanup.thread:
+; CHECK-NEXT: ret void
+; CHECK: if.end:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: ret void
+;
+; LMUL8-LABEL: define void @compare_bytes_cleanup_block(
+; LMUL8-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
+; LMUL8-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 12
+; LMUL8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
+; LMUL8-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; LMUL8-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP4]], 12
+; LMUL8-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
+; LMUL8-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
+; LMUL8-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
+; LMUL8-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; LMUL8-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 12
+; LMUL8-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]]
+; LMUL8-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]]
+; LMUL8-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; LMUL8-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8: mismatch_vector_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vector_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 0, [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP16]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP17]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT: [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vector_loop_inc:
+; LMUL8-NEXT: [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
+; LMUL8-NEXT: [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]]
+; LMUL8-NEXT: [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0
+; LMUL8-NEXT: br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vector_loop_found:
+; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]]
+; LMUL8-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_MEM_CHECK]] ], [ 1, [[MISMATCH_MIN_IT_CHECK]] ]
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]]
+; LMUL8-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
+; LMUL8-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]]
+; LMUL8-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; LMUL8-NEXT: [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]]
+; LMUL8-NEXT: br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT: [[TMP32:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], 0
+; LMUL8-NEXT: br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; LMUL8-NEXT: br i1 [[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]]
+; LMUL8: cleanup.thread:
+; LMUL8-NEXT: ret void
+; LMUL8: if.end:
+; LMUL8-NEXT: [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: ret void
+;
+; LOOP-DEL-LABEL: define void @compare_bytes_cleanup_block(
+; LOOP-DEL-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[MISMATCH_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP0]]
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP0]]
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP2]], [[TMP4]]
+; LOOP-DEL-NEXT: [[TMP6]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], 0
+; LOOP-DEL-NEXT: [[OR_COND:%.*]] = or i1 [[TMP5]], [[TMP7]]
+; LOOP-DEL-NEXT: br i1 [[OR_COND]], label [[COMMON_RET:%.*]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: common.ret:
+; LOOP-DEL-NEXT: ret void
+;
+; NO-TRANSFORM-LABEL: define void @compare_bytes_cleanup_block(
+; NO-TRANSFORM-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], 0
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; NO-TRANSFORM: cleanup.thread:
+; NO-TRANSFORM-NEXT: ret void
+; NO-TRANSFORM: if.end:
+; NO-TRANSFORM-NEXT: [[RES:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT: ret void
+entry:
+ br label %while.cond
+
+while.cond:
+ %len = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; the index starts at the constant 0
+ %inc = add i32 %len, 1
+ %cmp.not = icmp eq i32 %inc, 0 ; the bound is the constant 0 as well, so this holds only once %inc has wrapped around
+ br i1 %cmp.not, label %cleanup.thread, label %while.body ; the bound exit returns from its own block
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr i8, ptr %src1, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %arrayidx2 = getelementptr i8, ptr %src2, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2, align 1
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %if.end ; the mismatch exit targets a different block than the bound exit
+
+cleanup.thread:
+ ret void
+
+if.end:
+ %res = phi i32 [ %inc, %while.body ] ; only use of %inc outside the loop; %res itself has no users
+ ret void
+}
+
+;
+; NEGATIVE TESTS
+;
+
+; Similar to @compare_bytes_simple, except that the while.end block has an extra PHI whose
+; incoming value differs for each exiting block of the loop; the expected output keeps the loop as is.
+define i32 @compare_bytes_simple2(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple2(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; CHECK-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple2(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: br label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; LMUL8-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple2(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]]
+; LOOP-DEL: while.cond:
+; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL: while.body:
+; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]]
+;
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] ; same value from both exiting blocks
+ %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ] ; the extra PHI: %c on the mismatch exit, %d on the bound exit
+ store i32 %inc.lcssa, ptr %final_ptr ; the store destination therefore depends on which exit was taken
+ ret i32 %inc.lcssa
+}
+
+define i32 @compare_bytes_simple3(ptr %a, ptr %b, ptr %c, i32 %d, i32 %len, i32 %n) { ; negative test: the only exit PHI takes %d on one exit and %inc on the other
+; CHECK-LABEL: define i32 @compare_bytes_simple3(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; CHECK-NEXT: ret i32 [[FINAL_VAL]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple3(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: br label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; LMUL8-NEXT: ret i32 [[FINAL_VAL]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple3(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]]
+; LOOP-DEL: while.cond:
+; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL: while.body:
+; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; LOOP-DEL-NEXT: ret i32 [[FINAL_VAL]]
+;
+ entry:
+ br label %while.cond
+
+ while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+ while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+ while.end:
+ %final_val = phi i32 [ %d, %while.body ], [ %inc, %while.cond ] ; the mismatch exit yields %d, the bound exit yields %inc
+ store i32 %final_val, ptr %c
+ ret i32 %final_val
+}
+
+; The idiom is not transformed in a function marked noimplicitfloat; the expected output keeps the scalar loop.
+define i32 @no_implicit_float(ptr %a, ptr %b, i32 %len, i32 %n) noimplicitfloat {
+; CHECK-LABEL: define i32 @no_implicit_float(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @no_implicit_float(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: br label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @no_implicit_float(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]]
+; LOOP-DEL: while.cond:
+; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL: while.body:
+; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]]
+;
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] ; both exiting blocks yield the same value, %inc
+ ret i32 %inc.lcssa
+}
>From 8d285d7aa8f4d2bf03a8cc506fa7662b9c897afb Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 16 May 2024 11:58:43 -0700
Subject: [PATCH 2/2] fixup! [RISCV] Introduce the RISCVLoopIdiomRecognizePass
---
llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.h b/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.h
index 7906936b934b9..b31f16817a8b9 100644
--- a/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.h
+++ b/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.h
@@ -1,4 +1,4 @@
-//===-------- RISCVLoopIdiomRecognize.h -----------------------------------===//
+//===-------- RISCVLoopIdiomRecognize.h -------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
More information about the llvm-commits
mailing list