[llvm] [RISCV][LoopIdiomVectorize] Support VP intrinsics in LoopIdiomVectorize (PR #94082)

Min-Yih Hsu via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 13 15:14:42 PDT 2024


https://github.com/mshockwave updated https://github.com/llvm/llvm-project/pull/94082

>From 31c6fad204a249a81059bc460202fc9e7798b84b Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 6 Jun 2024 11:57:04 -0700
Subject: [PATCH 1/4] [LoopIdiomVectorize] Remove redundant DomTreeUpdates

Because most of our vector code is inserted between the original
preheader and a block split out from it, we don't actually need most
of the DTU updates: a single edge deletion update is sufficient to
update the DT of that region.

This is effectively NFC.
---
 .../Vectorize/LoopIdiomVectorize.cpp          | 52 ++++++-------------
 1 file changed, 17 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 38095b1433ebe..f52a32fee7401 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -40,6 +40,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -391,6 +392,14 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   BasicBlock *LoopIncBlock = BasicBlock::Create(
       Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock);
 
+  // This is actually one of the only two DTU updates we need. The reason being
+  // that we're splitting `mismatch_end` out of the preheader and putting
+  // most of the stuff we create later between the preheader and
+  // `mismatch_end`. Now when DTU removes an edge, it simply recalculates
+  // everything in between. In this case, it will be the preheader and
+  // `mismatch_end`, along with the aforementioned content. Therefore we don't
+  // need to insert additional DTU updates for new control flow edges
+  // added in this region.
   DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock},
                     {DominatorTree::Delete, Preheader, EndBlock}});
 
@@ -436,10 +445,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
       MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1));
   Builder.Insert(MinItCheckBr);
 
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock},
-       {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}});
-
   // For each of the arrays, check the start/end addresses are on the same
   // page.
   Builder.SetInsertPoint(MemCheckBlock);
@@ -482,10 +487,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
                                 .createBranchWeights(10, 90));
   Builder.Insert(CombinedPageCmpCmpBr);
 
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock},
-       {DominatorTree::Insert, MemCheckBlock, VectorLoopPreheaderBlock}});
-
   // Set up the vector loop preheader, i.e. calculate initial loop predicate,
   // zero-extend MaxLen to 64-bits, determine the number of vector elements
   // processed in each iteration, etc.
@@ -512,9 +513,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
   Builder.Insert(JumpToVectorLoop);
 
-  DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
-                     VectorLoopStartBlock}});
-
   // Set up the first vector loop block by creating the PHIs, doing the vector
   // loads and comparing the vectors.
   Builder.SetInsertPoint(VectorLoopStartBlock);
@@ -542,10 +540,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
       VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
   Builder.Insert(VectorEarlyExit);
 
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
-       {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
-
   // Increment the index counter and calculate the predicate for the next
   // iteration of the loop. We branch back to the start of the loop if there
   // is at least one active lane.
@@ -565,10 +559,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
       BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
   Builder.Insert(VectorLoopBranchBack);
 
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
-       {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
-
   // If we found a mismatch then we need to calculate which lane in the vector
   // had a mismatch and add that on to the current loop index.
   Builder.SetInsertPoint(VectorLoopMismatchBlock);
@@ -592,16 +582,10 @@ Value *LoopIdiomVectorize::expandFindMismatch(
 
   Builder.Insert(BranchInst::Create(EndBlock));
 
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, VectorLoopMismatchBlock, EndBlock}});
-
   // Generate code for scalar loop.
   Builder.SetInsertPoint(LoopPreHeaderBlock);
   Builder.Insert(BranchInst::Create(LoopStartBlock));
 
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}});
-
   Builder.SetInsertPoint(LoopStartBlock);
   PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index");
   IndexPhi->addIncoming(Start, LoopPreHeaderBlock);
@@ -623,9 +607,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp);
   Builder.Insert(MatchCmpBr);
 
-  DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock},
-                    {DominatorTree::Insert, LoopStartBlock, EndBlock}});
-
   // Have we reached the maximum permitted length for the loop?
   Builder.SetInsertPoint(LoopIncBlock);
   Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "",
@@ -636,9 +617,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp);
   Builder.Insert(IVCmpBr);
 
-  DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock},
-                    {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}});
-
   // In the end block we need to insert a PHI node to deal with three cases:
   //  1. We didn't find a mismatch in the scalar loop, so we return MaxLen.
   //  2. We exitted the scalar loop early due to a mismatch and need to return
@@ -679,7 +657,12 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
   BasicBlock *Header = CurLoop->getHeader();
   BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
   IRBuilder<> Builder(PHBranch);
+
+  // Safeguard to check if we build the correct DomTree with DTU.
+  auto CheckDTU = llvm::make_scope_exit(
+      [this]() { assert(DT->verify() && "Ill-formed DomTree built by DTU"); });
   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
   Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
 
   // Increment the pointer if this was done before the loads in the loop.
@@ -708,6 +691,9 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
   Builder.CreateCondBr(Builder.getTrue(), CmpBB, Header);
   PHBranch->eraseFromParent();
 
+  // Previously we took care of the DTU updates between the preheader and
+  // `mismatch_end`. Now we need to make sure edges and blocks appended after
+  // `mismatch_end` are also being properly accounted for.
   BasicBlock *MismatchEnd = cast<Instruction>(ByteCmpRes)->getParent();
   DTU.applyUpdates({{DominatorTree::Insert, MismatchEnd, CmpBB}});
 
@@ -717,12 +703,8 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
   if (FoundBB != EndBB) {
     Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen);
     Builder.CreateCondBr(FoundCmp, EndBB, FoundBB);
-    DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB},
-                      {DominatorTree::Insert, CmpBB, EndBB}});
-
   } else {
     Builder.CreateBr(FoundBB);
-    DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}});
   }
 
   auto fixSuccessorPhis = [&](BasicBlock *SuccBB) {

>From 174d02ecc00cdf7e0226e37bcf5aa8619f01f435 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 6 Jun 2024 13:20:55 -0700
Subject: [PATCH 2/4] [LoopIdiomVectorize][NFC] Factoring out the part that
 handles vectorization strategy

To pave the way for porting LIV to RISC-V, which uses VP intrinsics for
vectors.

NFC.
---
 .../Vectorize/LoopIdiomVectorize.cpp          | 219 ++++++++++--------
 1 file changed, 123 insertions(+), 96 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index f52a32fee7401..6b6b067db30b7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -79,6 +79,13 @@ class LoopIdiomVectorize {
   const TargetTransformInfo *TTI;
   const DataLayout *DL;
 
+  // Blocks that will be used for inserting vectorized code.
+  BasicBlock *EndBlock = nullptr;
+  BasicBlock *VectorLoopPreheaderBlock = nullptr;
+  BasicBlock *VectorLoopStartBlock = nullptr;
+  BasicBlock *VectorLoopMismatchBlock = nullptr;
+  BasicBlock *VectorLoopIncBlock = nullptr;
+
 public:
   explicit LoopIdiomVectorize(DominatorTree *DT, LoopInfo *LI,
                               const TargetTransformInfo *TTI,
@@ -96,9 +103,15 @@ class LoopIdiomVectorize {
                       SmallVectorImpl<BasicBlock *> &ExitBlocks);
 
   bool recognizeByteCompare();
+
   Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
                             GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
                             Instruction *Index, Value *Start, Value *MaxLen);
+
+  Value *createMaskedFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+                                  GetElementPtrInst *GEPB, Value *ExtStart,
+                                  Value *ExtEnd);
+
   void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
                             PHINode *IndPhi, Value *MaxLen, Instruction *Index,
                             Value *Start, bool IncIdx, BasicBlock *FoundBB,
@@ -332,6 +345,106 @@ bool LoopIdiomVectorize::recognizeByteCompare() {
   return true;
 }
 
+Value *LoopIdiomVectorize::createMaskedFindMismatch(IRBuilder<> &Builder,
+                                                    GetElementPtrInst *GEPA,
+                                                    GetElementPtrInst *GEPB,
+                                                    Value *ExtStart,
+                                                    Value *ExtEnd) {
+  Type *I64Type = Builder.getInt64Ty();
+  Type *ResType = Builder.getInt32Ty();
+  Type *LoadType = Builder.getInt8Ty();
+  Value *PtrA = GEPA->getPointerOperand();
+  Value *PtrB = GEPB->getPointerOperand();
+
+  // At this point we know two things must be true:
+  //  1. Start <= End
+  //  2. ExtMaxLen <= MinPageSize due to the page checks.
+  // Therefore, we know that we can use a 64-bit induction variable that
+  // starts from 0 -> ExtMaxLen and it will not overflow.
+  ScalableVectorType *PredVTy =
+      ScalableVectorType::get(Builder.getInt1Ty(), 16);
+
+  Value *InitialPred = Builder.CreateIntrinsic(
+      Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
+
+  Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
+  VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "",
+                             /*HasNUW=*/true, /*HasNSW=*/true);
+
+  Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
+                                            Builder.getInt1(false));
+
+  BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
+  Builder.Insert(JumpToVectorLoop);
+
+  // Set up the first vector loop block by creating the PHIs, doing the vector
+  // loads and comparing the vectors.
+  Builder.SetInsertPoint(VectorLoopStartBlock);
+  PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred");
+  LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
+  PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
+  VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
+  Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16);
+  Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
+
+  Value *VectorLhsGep =
+      Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds());
+  Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep,
+                                                  Align(1), LoopPred, Passthru);
+
+  Value *VectorRhsGep =
+      Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds());
+  Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep,
+                                                  Align(1), LoopPred, Passthru);
+
+  Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad);
+  VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse);
+  Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp);
+  BranchInst *VectorEarlyExit = BranchInst::Create(
+      VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
+  Builder.Insert(VectorEarlyExit);
+
+  // Increment the index counter and calculate the predicate for the next
+  // iteration of the loop. We branch back to the start of the loop if there
+  // is at least one active lane.
+  Builder.SetInsertPoint(VectorLoopIncBlock);
+  Value *NewVectorIndexPhi =
+      Builder.CreateAdd(VectorIndexPhi, VecLen, "",
+                        /*HasNUW=*/true, /*HasNSW=*/true);
+  VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
+  Value *NewPred =
+      Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
+                              {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd});
+  LoopPred->addIncoming(NewPred, VectorLoopIncBlock);
+
+  Value *PredHasActiveLanes =
+      Builder.CreateExtractElement(NewPred, uint64_t(0));
+  BranchInst *VectorLoopBranchBack =
+      BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
+  Builder.Insert(VectorLoopBranchBack);
+
+  // If we found a mismatch then we need to calculate which lane in the vector
+  // had a mismatch and add that on to the current loop index.
+  Builder.SetInsertPoint(VectorLoopMismatchBlock);
+  PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred");
+  FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock);
+  PHINode *LastLoopPred =
+      Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred");
+  LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock);
+  PHINode *VectorFoundIndex =
+      Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index");
+  VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
+
+  Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred);
+  Value *Ctz = Builder.CreateIntrinsic(
+      Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()},
+      {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)});
+  Ctz = Builder.CreateZExt(Ctz, I64Type);
+  Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "",
+                                             /*HasNUW=*/true, /*HasNSW=*/true);
+  return Builder.CreateTrunc(VectorLoopRes64, ResType);
+}
+
 Value *LoopIdiomVectorize::expandFindMismatch(
     IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
     GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) {
@@ -346,8 +459,7 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   Type *ResType = Builder.getInt32Ty();
 
   // Split block in the original loop preheader.
-  BasicBlock *EndBlock =
-      SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end");
+  EndBlock = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end");
 
   // Create the blocks that we're going to need:
   //  1. A block for checking the zero-extended length exceeds 0
@@ -371,17 +483,17 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   BasicBlock *MemCheckBlock = BasicBlock::Create(
       Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock);
 
-  BasicBlock *VectorLoopPreheaderBlock = BasicBlock::Create(
+  VectorLoopPreheaderBlock = BasicBlock::Create(
       Ctx, "mismatch_vec_loop_preheader", EndBlock->getParent(), EndBlock);
 
-  BasicBlock *VectorLoopStartBlock = BasicBlock::Create(
-      Ctx, "mismatch_vec_loop", EndBlock->getParent(), EndBlock);
+  VectorLoopStartBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop",
+                                            EndBlock->getParent(), EndBlock);
 
-  BasicBlock *VectorLoopIncBlock = BasicBlock::Create(
-      Ctx, "mismatch_vec_loop_inc", EndBlock->getParent(), EndBlock);
+  VectorLoopIncBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_inc",
+                                          EndBlock->getParent(), EndBlock);
 
-  BasicBlock *VectorLoopMismatchBlock = BasicBlock::Create(
-      Ctx, "mismatch_vec_loop_found", EndBlock->getParent(), EndBlock);
+  VectorLoopMismatchBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_found",
+                                               EndBlock->getParent(), EndBlock);
 
   BasicBlock *LoopPreHeaderBlock = BasicBlock::Create(
       Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock);
@@ -492,93 +604,8 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   // processed in each iteration, etc.
   Builder.SetInsertPoint(VectorLoopPreheaderBlock);
 
-  // At this point we know two things must be true:
-  //  1. Start <= End
-  //  2. ExtMaxLen <= MinPageSize due to the page checks.
-  // Therefore, we know that we can use a 64-bit induction variable that
-  // starts from 0 -> ExtMaxLen and it will not overflow.
-  ScalableVectorType *PredVTy =
-      ScalableVectorType::get(Builder.getInt1Ty(), 16);
-
-  Value *InitialPred = Builder.CreateIntrinsic(
-      Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
-
-  Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
-  VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "",
-                             /*HasNUW=*/true, /*HasNSW=*/true);
-
-  Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
-                                            Builder.getInt1(false));
-
-  BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
-  Builder.Insert(JumpToVectorLoop);
-
-  // Set up the first vector loop block by creating the PHIs, doing the vector
-  // loads and comparing the vectors.
-  Builder.SetInsertPoint(VectorLoopStartBlock);
-  PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred");
-  LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
-  PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
-  VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
-  Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16);
-  Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
-
-  Value *VectorLhsGep =
-      Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds());
-  Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep,
-                                                  Align(1), LoopPred, Passthru);
-
-  Value *VectorRhsGep =
-      Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds());
-  Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep,
-                                                  Align(1), LoopPred, Passthru);
-
-  Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad);
-  VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse);
-  Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp);
-  BranchInst *VectorEarlyExit = BranchInst::Create(
-      VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
-  Builder.Insert(VectorEarlyExit);
-
-  // Increment the index counter and calculate the predicate for the next
-  // iteration of the loop. We branch back to the start of the loop if there
-  // is at least one active lane.
-  Builder.SetInsertPoint(VectorLoopIncBlock);
-  Value *NewVectorIndexPhi =
-      Builder.CreateAdd(VectorIndexPhi, VecLen, "",
-                        /*HasNUW=*/true, /*HasNSW=*/true);
-  VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
-  Value *NewPred =
-      Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
-                              {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd});
-  LoopPred->addIncoming(NewPred, VectorLoopIncBlock);
-
-  Value *PredHasActiveLanes =
-      Builder.CreateExtractElement(NewPred, uint64_t(0));
-  BranchInst *VectorLoopBranchBack =
-      BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
-  Builder.Insert(VectorLoopBranchBack);
-
-  // If we found a mismatch then we need to calculate which lane in the vector
-  // had a mismatch and add that on to the current loop index.
-  Builder.SetInsertPoint(VectorLoopMismatchBlock);
-  PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred");
-  FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock);
-  PHINode *LastLoopPred =
-      Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred");
-  LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock);
-  PHINode *VectorFoundIndex =
-      Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index");
-  VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
-
-  Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred);
-  Value *Ctz = Builder.CreateIntrinsic(
-      Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()},
-      {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)});
-  Ctz = Builder.CreateZExt(Ctz, I64Type);
-  Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "",
-                                             /*HasNUW=*/true, /*HasNSW=*/true);
-  Value *VectorLoopRes = Builder.CreateTrunc(VectorLoopRes64, ResType);
+  Value *VectorLoopRes =
+      createMaskedFindMismatch(Builder, GEPA, GEPB, ExtStart, ExtEnd);
 
   Builder.Insert(BranchInst::Create(EndBlock));
 

>From 47050e61c00ef417a906b044cdbb0d58a998efe7 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 30 May 2024 11:10:07 -0700
Subject: [PATCH 3/4] [RISCV][LoopIdiomVectorize] Support VP intrinsics in
 LoopIdiomVectorize

Teach LoopIdiomVectorize to use VP intrinsics to replace the byte
compare loops. Right now only RISC-V uses LoopIdiomVectorize of this
style.
---
 .../Transforms/Vectorize/LoopIdiomVectorize.h |   17 +-
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |   10 +
 llvm/lib/Target/RISCV/RISCVTargetMachine.h    |    2 +
 .../Target/RISCV/RISCVTargetTransformInfo.h   |    2 +
 .../Vectorize/LoopIdiomVectorize.cpp          |  171 +-
 .../LoopIdiom/RISCV/byte-compare-index.ll     | 1751 +++++++++++++++++
 6 files changed, 1939 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h
index 56f44b7dc6b2a..ef6e0e0687809 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h
@@ -13,7 +13,22 @@
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 
 namespace llvm {
-struct LoopIdiomVectorizePass : PassInfoMixin<LoopIdiomVectorizePass> {
+enum class LoopIdiomVectorizeStyle { Masked, Predicated };
+
+class LoopIdiomVectorizePass : public PassInfoMixin<LoopIdiomVectorizePass> {
+  LoopIdiomVectorizeStyle VectorizeStyle = LoopIdiomVectorizeStyle::Masked;
+
+  // The VF used in vectorizing the byte compare pattern.
+  unsigned ByteCompareVF = 16;
+
+public:
+  LoopIdiomVectorizePass() = default;
+  explicit LoopIdiomVectorizePass(LoopIdiomVectorizeStyle S)
+      : VectorizeStyle(S) {}
+
+  LoopIdiomVectorizePass(LoopIdiomVectorizeStyle S, unsigned BCVF)
+      : VectorizeStyle(S), ByteCompareVF(BCVF) {}
+
   PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
                         LoopStandardAnalysisResults &AR, LPMUpdater &U);
 };
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 35d0b3408d09f..caa04830d1c37 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -33,10 +33,12 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Passes/PassBuilder.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
 #include <optional>
 using namespace llvm;
 
@@ -573,6 +575,14 @@ void RISCVPassConfig::addPostRegAlloc() {
     addPass(createRISCVRedundantCopyEliminationPass());
 }
 
+void RISCVTargetMachine::registerPassBuilderCallbacks(
+    PassBuilder &PB, bool PopulateClassToPassNames) {
+  PB.registerLateLoopOptimizationsEPCallback([=](LoopPassManager &LPM,
+                                                 OptimizationLevel Level) {
+    LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated));
+  });
+}
+
 yaml::MachineFunctionInfo *
 RISCVTargetMachine::createDefaultFuncInfoYAML() const {
   return new yaml::RISCVMachineFunctionInfo();
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
index 68dfb3c81f2fe..7111d5ec80e47 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
@@ -59,6 +59,8 @@ class RISCVTargetMachine : public LLVMTargetMachine {
                                 PerFunctionMIParsingState &PFS,
                                 SMDiagnostic &Error,
                                 SMRange &SourceRange) const override;
+  void registerPassBuilderCallbacks(PassBuilder &PB,
+                                    bool PopulateClassToPassNames) override;
 };
 } // namespace llvm
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index a4d1390875095..073779e07b513 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -397,6 +397,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
   bool shouldFoldTerminatingConditionAfterLSR() const {
     return true;
   }
+
+  std::optional<unsigned> getMinPageSize() const { return 4096; }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 6b6b067db30b7..fbdd36a329801 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -60,19 +60,34 @@ static cl::opt<bool> DisableAll("disable-loop-idiom-vectorize-all", cl::Hidden,
                                 cl::init(false),
                                 cl::desc("Disable Loop Idiom Vectorize Pass."));
 
+static cl::opt<LoopIdiomVectorizeStyle>
+    LITVecStyle("loop-idiom-vectorize-style", cl::Hidden,
+                cl::desc("The vectorization style for loop idiom transform."),
+                cl::values(clEnumValN(LoopIdiomVectorizeStyle::Masked, "masked",
+                                      "Use masked vector intrinsics"),
+                           clEnumValN(LoopIdiomVectorizeStyle::Predicated,
+                                      "predicated", "Use VP intrinsics")),
+                cl::init(LoopIdiomVectorizeStyle::Masked));
+
 static cl::opt<bool>
     DisableByteCmp("disable-loop-idiom-vectorize-bytecmp", cl::Hidden,
                    cl::init(false),
                    cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
                             "not convert byte-compare loop(s)."));
 
+static cl::opt<unsigned>
+    ByteCmpVF("loop-idiom-vectorize-bytecmp-vf", cl::Hidden,
+              cl::desc("The vectorization factor for byte-compare patterns."),
+              cl::init(16));
+
 static cl::opt<bool>
     VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false),
                 cl::desc("Verify loops generated Loop Idiom Vectorize Pass."));
 
 namespace {
-
 class LoopIdiomVectorize {
+  LoopIdiomVectorizeStyle VectorizeStyle;
+  unsigned ByteCompareVF;
   Loop *CurLoop = nullptr;
   DominatorTree *DT;
   LoopInfo *LI;
@@ -87,10 +102,11 @@ class LoopIdiomVectorize {
   BasicBlock *VectorLoopIncBlock = nullptr;
 
 public:
-  explicit LoopIdiomVectorize(DominatorTree *DT, LoopInfo *LI,
-                              const TargetTransformInfo *TTI,
-                              const DataLayout *DL)
-      : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+  LoopIdiomVectorize(LoopIdiomVectorizeStyle S, unsigned VF, DominatorTree *DT,
+                     LoopInfo *LI, const TargetTransformInfo *TTI,
+                     const DataLayout *DL)
+      : VectorizeStyle(S), ByteCompareVF(VF), DT(DT), LI(LI), TTI(TTI), DL(DL) {
+  }
 
   bool run(Loop *L);
 
@@ -111,6 +127,10 @@ class LoopIdiomVectorize {
   Value *createMaskedFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
                                   GetElementPtrInst *GEPB, Value *ExtStart,
                                   Value *ExtEnd);
+  Value *createPredicatedFindMismatch(IRBuilder<> &Builder,
+                                      GetElementPtrInst *GEPA,
+                                      GetElementPtrInst *GEPB, Value *ExtStart,
+                                      Value *ExtEnd);
 
   void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
                             PHINode *IndPhi, Value *MaxLen, Instruction *Index,
@@ -128,8 +148,16 @@ PreservedAnalyses LoopIdiomVectorizePass::run(Loop &L, LoopAnalysisManager &AM,
 
   const auto *DL = &L.getHeader()->getModule()->getDataLayout();
 
-  LoopIdiomVectorize LIT(&AR.DT, &AR.LI, &AR.TTI, DL);
-  if (!LIT.run(&L))
+  LoopIdiomVectorizeStyle VecStyle = VectorizeStyle;
+  if (LITVecStyle.getNumOccurrences())
+    VecStyle = LITVecStyle;
+
+  unsigned BCVF = ByteCompareVF;
+  if (ByteCmpVF.getNumOccurrences())
+    BCVF = ByteCmpVF;
+
+  LoopIdiomVectorize LIV(VecStyle, BCVF, &AR.DT, &AR.LI, &AR.TTI, DL);
+  if (!LIV.run(&L))
     return PreservedAnalyses::all();
 
   return PreservedAnalyses::none();
@@ -362,14 +390,15 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(IRBuilder<> &Builder,
   // Therefore, we know that we can use a 64-bit induction variable that
   // starts from 0 -> ExtMaxLen and it will not overflow.
   ScalableVectorType *PredVTy =
-      ScalableVectorType::get(Builder.getInt1Ty(), 16);
+      ScalableVectorType::get(Builder.getInt1Ty(), ByteCompareVF);
 
   Value *InitialPred = Builder.CreateIntrinsic(
       Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
 
   Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
-  VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "",
-                             /*HasNUW=*/true, /*HasNSW=*/true);
+  VecLen =
+      Builder.CreateMul(VecLen, ConstantInt::get(I64Type, ByteCompareVF), "",
+                        /*HasNUW=*/true, /*HasNSW=*/true);
 
   Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
                                             Builder.getInt1(false));
@@ -384,7 +413,8 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(IRBuilder<> &Builder,
   LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
   PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
   VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
-  Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16);
+  Type *VectorLoadType =
+      ScalableVectorType::get(Builder.getInt8Ty(), ByteCompareVF);
   Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
 
   Value *VectorLhsGep =
@@ -445,6 +475,112 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(IRBuilder<> &Builder,
   return Builder.CreateTrunc(VectorLoopRes64, ResType);
 }
 
+/// Emit the VP-intrinsic ("predicated") form of the find-mismatch vector loop.
+///
+/// Instead of a lane mask, the loop is driven by an explicit vector length
+/// (EVL) computed each iteration with llvm.experimental.get.vector.length;
+/// every VP intrinsic emitted here uses an all-true mask together with that
+/// EVL.
+///
+/// The function fills in three pre-created blocks:
+///  - VectorLoopStartBlock:    index PHI, EVL computation, the two vp.load
+///                             calls, vp.icmp, vp.cttz.elts and the early-exit
+///                             branch.
+///  - VectorLoopIncBlock:      index increment and the back edge / exit to
+///                             EndBlock.
+///  - VectorLoopMismatchBlock: LCSSA PHIs and the final index computation.
+///
+/// \param Builder  Inserts the jump into VectorLoopStartBlock at its current
+///                 position (the index PHI expects VectorLoopPreheaderBlock as
+///                 the predecessor). On return it points into
+///                 VectorLoopMismatchBlock, which has no terminator yet.
+/// \param GEPA     GEP for the first buffer; provides the base pointer and the
+///                 inbounds flag copied onto the generated GEP.
+/// \param GEPB     Same as \p GEPA, for the second buffer.
+/// \param ExtStart 64-bit index of the first byte to compare.
+/// \param ExtEnd   64-bit index one past the last byte to compare.
+/// \returns The i32 index of the first mismatching byte, defined in
+///          VectorLoopMismatchBlock.
+Value *LoopIdiomVectorize::createPredicatedFindMismatch(IRBuilder<> &Builder,
+                                                        GetElementPtrInst *GEPA,
+                                                        GetElementPtrInst *GEPB,
+                                                        Value *ExtStart,
+                                                        Value *ExtEnd) {
+  Type *I64Type = Builder.getInt64Ty();
+  Type *I32Type = Builder.getInt32Ty();
+  Type *ResType = I32Type;
+  Type *LoadType = Builder.getInt8Ty();
+  Value *PtrA = GEPA->getPointerOperand();
+  Value *PtrB = GEPB->getPointerOperand();
+
+  // At this point we know two things must be true:
+  //  1. Start <= End
+  //  2. ExtMaxLen <= 4096 due to the page checks.
+  // Therefore, we know that we can use a 64-bit induction variable that
+  // starts from 0 -> ExtMaxLen and it will not overflow.
+  auto *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
+  Builder.Insert(JumpToVectorLoop);
+
+  // Set up the first Vector loop block by creating the PHIs, doing the vector
+  // loads and comparing the vectors.
+  Builder.SetInsertPoint(VectorLoopStartBlock);
+  auto *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vector_index");
+  VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
+
+  // Calculate AVL by subtracting the vector loop index from the trip count
+  Value *AVL = Builder.CreateSub(ExtEnd, VectorIndexPhi, "avl", /*HasNUW=*/true,
+                                 /*HasNSW=*/true);
+
+  auto *VectorLoadType = ScalableVectorType::get(LoadType, ByteCompareVF);
+  ElementCount LoadEC = VectorLoadType->getElementCount();
+  auto *VF = ConstantInt::get(I32Type, LoadEC.getKnownMinValue());
+  auto *IsScalable =
+      ConstantInt::getBool(Builder.getContext(), LoadEC.isScalable());
+
+  // VL is the number of elements handled by this iteration; it is the EVL
+  // operand of every VP intrinsic below.
+  Value *VL = Builder.CreateIntrinsic(Intrinsic::experimental_get_vector_length,
+                                      {I64Type}, {AVL, VF, IsScalable});
+  Value *GepOffset = VectorIndexPhi;
+
+  // No lane is masked off explicitly; the EVL alone bounds the accesses.
+  VectorType *TrueMaskTy = VectorType::get(Builder.getInt1Ty(), LoadEC);
+  Value *AllTrueMask = Constant::getAllOnesValue(TrueMaskTy);
+
+  Value *VectorLhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset);
+  if (GEPA->isInBounds())
+    cast<GetElementPtrInst>(VectorLhsGep)->setIsInBounds(true);
+  Value *VectorLhsLoad = Builder.CreateIntrinsic(
+      Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
+      {VectorLhsGep, AllTrueMask, VL}, nullptr, "lhs.load");
+
+  Value *VectorRhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset);
+  if (GEPB->isInBounds())
+    cast<GetElementPtrInst>(VectorRhsGep)->setIsInBounds(true);
+  // Overload the RHS load on the RHS pointer type (not the LHS one) so the
+  // intrinsic signature always matches the pointer actually passed.
+  Value *VectorRhsLoad = Builder.CreateIntrinsic(
+      Intrinsic::vp_load, {VectorLoadType, VectorRhsGep->getType()},
+      {VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load");
+
+  // vp.icmp takes its predicate as a metadata string operand.
+  LLVMContext &Ctx = VectorLhsLoad->getContext();
+  StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE);
+  auto *PredicateMDS = MDString::get(Ctx, PredicateStr);
+  Value *Pred = MetadataAsValue::get(Ctx, PredicateMDS);
+  Value *VectorMatchCmp = Builder.CreateIntrinsic(
+      Intrinsic::vp_icmp, {VectorLhsLoad->getType()},
+      {VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr,
+      "mismatch.cmp");
+  Value *CTZ = Builder.CreateIntrinsic(
+      Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()},
+      {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true), AllTrueMask,
+       VL});
+  // RISC-V refines/lowers the poison returned by vp.cttz.elts to -1.
+  // NOTE(review): this makes the branch below depend on a poison value at the
+  // IR level when no lane mismatches. Consider passing ZeroIsPoison=false and
+  // comparing the result against VL instead -- confirm against the LangRef.
+  Value *MismatchFound =
+      Builder.CreateICmpSGE(CTZ, ConstantInt::get(ResType, 0));
+  auto *VectorEarlyExit = BranchInst::Create(VectorLoopMismatchBlock,
+                                             VectorLoopIncBlock, MismatchFound);
+  Builder.Insert(VectorEarlyExit);
+
+  // Advance the index by the number of elements processed in this iteration
+  // (VL). We branch back to the start of the loop unless the new index has
+  // reached ExtEnd, in which case we leave through EndBlock.
+  Builder.SetInsertPoint(VectorLoopIncBlock);
+  Value *VL64 = Builder.CreateZExt(VL, I64Type);
+  Value *NewVectorIndexPhi =
+      Builder.CreateAdd(VectorIndexPhi, VL64, "",
+                        /*HasNUW=*/true, /*HasNSW=*/true);
+  VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
+  Value *ExitCond = Builder.CreateICmpNE(NewVectorIndexPhi, ExtEnd);
+  auto *VectorLoopBranchBack =
+      BranchInst::Create(VectorLoopStartBlock, EndBlock, ExitCond);
+  Builder.Insert(VectorLoopBranchBack);
+
+  // If we found a mismatch then we need to calculate which lane in the vector
+  // had a mismatch and add that on to the current loop index.
+  Builder.SetInsertPoint(VectorLoopMismatchBlock);
+
+  // Add LCSSA phis for CTZ and VectorIndexPhi.
+  auto *CTZLCSSAPhi = Builder.CreatePHI(CTZ->getType(), 1, "ctz");
+  CTZLCSSAPhi->addIncoming(CTZ, VectorLoopStartBlock);
+  auto *VectorIndexLCSSAPhi =
+      Builder.CreatePHI(VectorIndexPhi->getType(), 1, "mismatch_vector_index");
+  VectorIndexLCSSAPhi->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
+
+  // Result = loop index at the start of the iteration + lane of the mismatch,
+  // narrowed back to the i32 result type.
+  Value *CTZI64 = Builder.CreateZExt(CTZLCSSAPhi, I64Type);
+  Value *VectorLoopRes64 = Builder.CreateAdd(VectorIndexLCSSAPhi, CTZI64, "",
+                                             /*HasNUW=*/true, /*HasNSW=*/true);
+  return Builder.CreateTrunc(VectorLoopRes64, ResType);
+}
+
 Value *LoopIdiomVectorize::expandFindMismatch(
     IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
     GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) {
@@ -604,8 +740,17 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   // processed in each iteration, etc.
   Builder.SetInsertPoint(VectorLoopPreheaderBlock);
 
-  Value *VectorLoopRes =
-      createMaskedFindMismatch(Builder, GEPA, GEPB, ExtStart, ExtEnd);
+  Value *VectorLoopRes = nullptr;
+  switch (VectorizeStyle) {
+  case LoopIdiomVectorizeStyle::Masked:
+    VectorLoopRes =
+        createMaskedFindMismatch(Builder, GEPA, GEPB, ExtStart, ExtEnd);
+    break;
+  case LoopIdiomVectorizeStyle::Predicated:
+    VectorLoopRes =
+        createPredicatedFindMismatch(Builder, GEPA, GEPB, ExtStart, ExtEnd);
+    break;
+  }
 
   Builder.Insert(BranchInst::Create(EndBlock));
 
diff --git a/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
new file mode 100644
index 0000000000000..845daa402606f
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
@@ -0,0 +1,1751 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=loop-idiom-vectorize -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -mattr=+v -S < %s | FileCheck %s
+; RUN: opt -passes=loop-idiom-vectorize -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -loop-idiom-vectorize-bytecmp-vf=64 -mattr=+v -S < %s | FileCheck %s --check-prefix=LMUL8
+; RUN: opt -passes='loop(loop-idiom-vectorize),simplifycfg' -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -mattr=+v -S < %s | FileCheck %s --check-prefix=LOOP-DEL
+
+define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK:       mismatch_min_it_check:
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       mismatch_mem_check:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       mismatch_vec_loop_preheader:
+; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK:       mismatch_vec_loop:
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK:       mismatch_vec_loop_inc:
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK:       mismatch_vec_loop_found:
+; CHECK-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT:    br label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_pre:
+; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; CHECK:       mismatch_loop:
+; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_inc:
+; CHECK-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK:       mismatch_end:
+; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK:       byte.compare:
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8:       mismatch_min_it_check:
+; LMUL8-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; LMUL8:       mismatch_mem_check:
+; LMUL8-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; LMUL8:       mismatch_vec_loop_preheader:
+; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8:       mismatch_vec_loop:
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8:       mismatch_vec_loop_inc:
+; LMUL8-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8:       mismatch_vec_loop_found:
+; LMUL8-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT:    br label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_pre:
+; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LMUL8:       mismatch_loop:
+; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_inc:
+; LMUL8-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8:       mismatch_end:
+; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8:       byte.compare:
+; LMUL8-NEXT:    br label [[WHILE_END]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; LOOP-DEL:       mismatch_mem_check:
+; LOOP-DEL-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1:![0-9]+]]
+; LOOP-DEL:       mismatch_vec_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL:       mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL:       mismatch_vec_loop_found:
+; LOOP-DEL-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT:    br label [[WHILE_END]]
+; LOOP-DEL:       mismatch_loop_pre:
+; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL:       mismatch_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL:       mismatch_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL:       while.end:
+; LOOP-DEL-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT:    ret i32 [[MISMATCH_RESULT]]
+;
+entry:
+  br label %while.cond
+
+while.cond:
+  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+  %inc = add i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+  ret i32 %inc.lcssa
+}
+
+define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_signed_wrap(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK:       mismatch_min_it_check:
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK:       mismatch_mem_check:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK:       mismatch_vec_loop_preheader:
+; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK:       mismatch_vec_loop:
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK:       mismatch_vec_loop_inc:
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK:       mismatch_vec_loop_found:
+; CHECK-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT:    br label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_pre:
+; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; CHECK:       mismatch_loop:
+; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_inc:
+; CHECK-NEXT:    [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK:       mismatch_end:
+; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK:       byte.compare:
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_signed_wrap(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8:       mismatch_min_it_check:
+; LMUL8-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8:       mismatch_mem_check:
+; LMUL8-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8:       mismatch_vec_loop_preheader:
+; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8:       mismatch_vec_loop:
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8:       mismatch_vec_loop_inc:
+; LMUL8-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8:       mismatch_vec_loop_found:
+; LMUL8-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT:    br label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_pre:
+; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LMUL8:       mismatch_loop:
+; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_inc:
+; LMUL8-NEXT:    [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8:       mismatch_end:
+; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8:       byte.compare:
+; LMUL8-NEXT:    br label [[WHILE_END]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_signed_wrap(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL:       mismatch_mem_check:
+; LOOP-DEL-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL:       mismatch_vec_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL:       mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL:       mismatch_vec_loop_found:
+; LOOP-DEL-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT:    br label [[WHILE_END]]
+; LOOP-DEL:       mismatch_loop_pre:
+; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL:       mismatch_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL:       mismatch_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL:       while.end:
+; LOOP-DEL-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT:    ret i32 [[MISMATCH_RESULT]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_signed_wrap(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
+; NO-TRANSFORM-NEXT:  entry:
+; NO-TRANSFORM-NEXT:    br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM:       while.cond:
+; NO-TRANSFORM-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT:    [[INC]] = add nsw i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM:       while.body:
+; NO-TRANSFORM-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; NO-TRANSFORM:       while.end:
+; NO-TRANSFORM-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT:    ret i32 [[INC_LCSSA]]
+entry:
+  br label %while.cond
+
+while.cond:
+  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+  %inc = add nsw i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+  ret i32 %inc.lcssa
+}
+
+
+define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK:       mismatch_min_it_check:
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK:       mismatch_mem_check:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK:       mismatch_vec_loop_preheader:
+; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK:       mismatch_vec_loop:
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK:       mismatch_vec_loop_inc:
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK:       mismatch_vec_loop_found:
+; CHECK-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT:    br label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_pre:
+; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; CHECK:       mismatch_loop:
+; CHECK-NEXT:    [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_inc:
+; CHECK-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK:       mismatch_end:
+; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; CHECK:       while.found:
+; CHECK-NEXT:    [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    br label [[END:%.*]]
+; CHECK:       byte.compare:
+; CHECK-NEXT:    [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; CHECK-NEXT:    store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; CHECK-NEXT:    ret i32 [[MISMATCH_INDEX]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8:       mismatch_min_it_check:
+; LMUL8-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8:       mismatch_mem_check:
+; LMUL8-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8:       mismatch_vec_loop_preheader:
+; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8:       mismatch_vec_loop:
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8:       mismatch_vec_loop_inc:
+; LMUL8-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8:       mismatch_vec_loop_found:
+; LMUL8-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT:    br label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_pre:
+; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LMUL8:       mismatch_loop:
+; LMUL8-NEXT:    [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; LMUL8-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_inc:
+; LMUL8-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; LMUL8-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8:       mismatch_end:
+; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; LMUL8:       while.found:
+; LMUL8-NEXT:    [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    br label [[END:%.*]]
+; LMUL8:       byte.compare:
+; LMUL8-NEXT:    [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    br label [[END]]
+; LMUL8:       end:
+; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; LMUL8-NEXT:    [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; LMUL8-NEXT:    store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; LMUL8-NEXT:    ret i32 [[MISMATCH_INDEX]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL:       mismatch_mem_check:
+; LOOP-DEL-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL:       mismatch_vec_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL:       mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[BYTE_COMPARE:%.*]]
+; LOOP-DEL:       mismatch_vec_loop_found:
+; LOOP-DEL-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT:    br label [[BYTE_COMPARE]]
+; LOOP-DEL:       mismatch_loop_pre:
+; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL:       mismatch_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; LOOP-DEL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[BYTE_COMPARE]]
+; LOOP-DEL:       mismatch_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; LOOP-DEL-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP36]], label [[BYTE_COMPARE]], label [[MISMATCH_LOOP]]
+; LOOP-DEL:       byte.compare:
+; LOOP-DEL-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT:    [[TMP37:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LOOP-DEL-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[TMP37]], i32 [[N]], i32 [[MISMATCH_RESULT]]
+; LOOP-DEL-NEXT:    [[SPEC_SELECT4:%.*]] = select i1 [[TMP37]], ptr [[D]], ptr [[C]]
+; LOOP-DEL-NEXT:    store i32 [[SPEC_SELECT]], ptr [[SPEC_SELECT4]], align 4
+; LOOP-DEL-NEXT:    ret i32 [[SPEC_SELECT]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
+; NO-TRANSFORM-NEXT:  entry:
+; NO-TRANSFORM-NEXT:    br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM:       while.cond:
+; NO-TRANSFORM-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM:       while.body:
+; NO-TRANSFORM-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; NO-TRANSFORM:       while.found:
+; NO-TRANSFORM-NEXT:    [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT:    [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT:    br label [[END:%.*]]
+; NO-TRANSFORM:       while.end:
+; NO-TRANSFORM-NEXT:    [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT:    [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT:    br label [[END]]
+; NO-TRANSFORM:       end:
+; NO-TRANSFORM-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; NO-TRANSFORM-NEXT:    [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; NO-TRANSFORM-NEXT:    store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; NO-TRANSFORM-NEXT:    ret i32 [[MISMATCH_INDEX]]
+entry:
+  br label %while.cond
+
+while.cond:
+  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+  %inc = add i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.found
+
+while.found:
+  %mismatch_index1 = phi i32 [ %inc, %while.body ]
+  %found_ptr = phi ptr [ %c, %while.body ]
+  br label %end
+
+while.end:
+  %mismatch_index2 = phi i32 [ %n, %while.cond ]
+  %end_ptr = phi ptr [ %d, %while.cond ]
+  br label %end
+
+end:
+  %mismatch_index = phi i32 [ %mismatch_index1, %while.found ], [ %mismatch_index2, %while.end ]
+  %store_ptr = phi ptr [ %end_ptr, %while.end ], [ %found_ptr, %while.found ]
+  store i32 %mismatch_index, ptr %store_ptr
+  ret i32 %mismatch_index
+}
+
+
+
+define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
+; CHECK-LABEL: define i32 @compare_bytes_extra_cmp(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; CHECK-NEXT:    br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; CHECK:       ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK:       mismatch_min_it_check:
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK:       mismatch_mem_check:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK:       mismatch_vec_loop_preheader:
+; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK:       mismatch_vec_loop:
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK:       mismatch_vec_loop_inc:
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK:       mismatch_vec_loop_found:
+; CHECK-NEXT:    [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; CHECK-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT:    br label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_pre:
+; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; CHECK:       mismatch_loop:
+; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_inc:
+; CHECK-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK:       mismatch_end:
+; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
+; CHECK:       byte.compare:
+; CHECK-NEXT:    br label [[WHILE_END_LOOPEXIT]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_extra_cmp(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; LMUL8-NEXT:    br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; LMUL8:       ph:
+; LMUL8-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8:       mismatch_min_it_check:
+; LMUL8-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8:       mismatch_mem_check:
+; LMUL8-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8:       mismatch_vec_loop_preheader:
+; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8:       mismatch_vec_loop:
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8:       mismatch_vec_loop_inc:
+; LMUL8-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8:       mismatch_vec_loop_found:
+; LMUL8-NEXT:    [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; LMUL8-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; LMUL8-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT:    br label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_pre:
+; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LMUL8:       mismatch_loop:
+; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_inc:
+; LMUL8-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8:       mismatch_end:
+; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
+; LMUL8:       byte.compare:
+; LMUL8-NEXT:    br label [[WHILE_END_LOOPEXIT]]
+; LMUL8:       while.end.loopexit:
+; LMUL8-NEXT:    [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    br label [[WHILE_END]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
+; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_extra_cmp(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; LOOP-DEL:       ph:
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL:       mismatch_mem_check:
+; LOOP-DEL-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL:       mismatch_vec_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL:       mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END]]
+; LOOP-DEL:       mismatch_vec_loop_found:
+; LOOP-DEL-NEXT:    [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; LOOP-DEL-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; LOOP-DEL-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT:    br label [[WHILE_END]]
+; LOOP-DEL:       mismatch_loop_pre:
+; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL:       mismatch_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL:       mismatch_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL:       while.end:
+; LOOP-DEL-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_extra_cmp(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) {
+; NO-TRANSFORM-NEXT:  entry:
+; NO-TRANSFORM-NEXT:    [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; NO-TRANSFORM:       ph:
+; NO-TRANSFORM-NEXT:    br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM:       while.cond:
+; NO-TRANSFORM-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[PH]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]]
+; NO-TRANSFORM:       while.body:
+; NO-TRANSFORM-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; NO-TRANSFORM:       while.end:
+; NO-TRANSFORM-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ], [ [[X]], [[ENTRY:%.*]] ]
+; NO-TRANSFORM-NEXT:    ret i32 [[INC_LCSSA]]
+entry:
+  %cmp.x = icmp ult i32 %n, %x
+  br i1 %cmp.x, label %ph, label %while.end
+
+ph:
+  br label %while.cond
+
+while.cond:
+  %len.addr = phi i32 [ %len, %ph ], [ %inc, %while.body ]
+  %inc = add i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ], [ %x, %entry ]
+  ret i32 %inc.lcssa
+}
+
+; The loop's two exits go to different blocks: %cleanup.thread when %inc
+; reaches the bound and %if.end on a byte mismatch. Both the initial value of
+; the induction variable and the bound are the constant 0, and the GEPs are not
+; marked inbounds. In the transformed output the two destinations are
+; re-selected in byte.compare by comparing the mismatch result against the bound.
+define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) {
+; CHECK-LABEL: define void @compare_bytes_cleanup_block(
+; CHECK-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK:       mismatch_min_it_check:
+; CHECK-NEXT:    br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK:       mismatch_mem_check:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP10]], 12
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP4]], 12
+; CHECK-NEXT:    [[TMP8:%.*]] = lshr i64 [[TMP9]], 12
+; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK:       mismatch_vec_loop_preheader:
+; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK:       mismatch_vec_loop:
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 0, [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP16]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK:       mismatch_vec_loop_inc:
+; CHECK-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
+; CHECK-NEXT:    [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK:       mismatch_vec_loop_found:
+; CHECK-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+; CHECK-NEXT:    br label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_pre:
+; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; CHECK:       mismatch_loop:
+; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]]
+; CHECK-NEXT:    br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_inc:
+; CHECK-NEXT:    [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK:       mismatch_end:
+; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
+; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; CHECK:       byte.compare:
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; CHECK-NEXT:    br i1 [[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]]
+; CHECK:       cleanup.thread:
+; CHECK-NEXT:    ret void
+; CHECK:       if.end:
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    ret void
+;
+; LMUL8-LABEL: define void @compare_bytes_cleanup_block(
+; LMUL8-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8:       mismatch_min_it_check:
+; LMUL8-NEXT:    br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8:       mismatch_mem_check:
+; LMUL8-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
+; LMUL8-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
+; LMUL8-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LMUL8-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; LMUL8-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
+; LMUL8-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
+; LMUL8-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
+; LMUL8-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP10]], 12
+; LMUL8-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP4]], 12
+; LMUL8-NEXT:    [[TMP8:%.*]] = lshr i64 [[TMP9]], 12
+; LMUL8-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]]
+; LMUL8-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]]
+; LMUL8-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; LMUL8-NEXT:    br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8:       mismatch_vec_loop_preheader:
+; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8:       mismatch_vec_loop:
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 0, [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP16]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP17]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT:    [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8:       mismatch_vec_loop_inc:
+; LMUL8-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
+; LMUL8-NEXT:    [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]]
+; LMUL8-NEXT:    [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0
+; LMUL8-NEXT:    br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8:       mismatch_vec_loop_found:
+; LMUL8-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]]
+; LMUL8-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+; LMUL8-NEXT:    br label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_pre:
+; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LMUL8:       mismatch_loop:
+; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]]
+; LMUL8-NEXT:    [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
+; LMUL8-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]]
+; LMUL8-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; LMUL8-NEXT:    [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]]
+; LMUL8-NEXT:    br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_inc:
+; LMUL8-NEXT:    [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
+; LMUL8-NEXT:    br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8:       mismatch_end:
+; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
+; LMUL8-NEXT:    [[INC:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; LMUL8:       byte.compare:
+; LMUL8-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; LMUL8-NEXT:    br i1 [[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]]
+; LMUL8:       cleanup.thread:
+; LMUL8-NEXT:    ret void
+; LMUL8:       if.end:
+; LMUL8-NEXT:    [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    ret void
+;
+; LOOP-DEL-LABEL: define void @compare_bytes_cleanup_block(
+; LOOP-DEL-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL:       mismatch_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[MISMATCH_LOOP]] ]
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP0]]
+; LOOP-DEL-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1
+; LOOP-DEL-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP0]]
+; LOOP-DEL-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
+; LOOP-DEL-NEXT:    [[TMP5:%.*]] = icmp ne i8 [[TMP2]], [[TMP4]]
+; LOOP-DEL-NEXT:    [[TMP6]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
+; LOOP-DEL-NEXT:    [[OR_COND:%.*]] = or i1 [[TMP5]], [[TMP7]]
+; LOOP-DEL-NEXT:    br i1 [[OR_COND]], label [[COMMON_RET:%.*]], label [[MISMATCH_LOOP]]
+; LOOP-DEL:       common.ret:
+; LOOP-DEL-NEXT:    ret void
+;
+; NO-TRANSFORM-LABEL: define void @compare_bytes_cleanup_block(
+; NO-TRANSFORM-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; NO-TRANSFORM-NEXT:  entry:
+; NO-TRANSFORM-NEXT:    br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM:       while.cond:
+; NO-TRANSFORM-NEXT:    [[LEN:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-TRANSFORM-NEXT:    [[INC]] = add i32 [[LEN]], 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], 0
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM:       while.body:
+; NO-TRANSFORM-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; NO-TRANSFORM:       cleanup.thread:
+; NO-TRANSFORM-NEXT:    ret void
+; NO-TRANSFORM:       if.end:
+; NO-TRANSFORM-NEXT:    [[RES:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT:    ret void
+entry:
+  br label %while.cond
+
+while.cond:
+  %len = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %inc = add i32 %len, 1
+  ; Bound exit: %inc starts at 1, so it only equals 0 after the i32 add wraps.
+  %cmp.not = icmp eq i32 %inc, 0
+  br i1 %cmp.not, label %cleanup.thread, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr i8, ptr %src1, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr i8, ptr %src2, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2, align 1
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %if.end
+
+cleanup.thread:
+  ret void
+
+if.end:
+  ; Mismatch exit: single-entry phi of the loop index; the ret does not use it.
+  %res = phi i32 [ %inc, %while.body ]
+  ret void
+}
+
+;
+; NEGATIVE TESTS
+;
+
+; Similar to @compare_bytes_simple, except in the while.end block we have an extra PHI
+; with unique values for each incoming block from the loop.
+; The autogenerated assertions below all show the original while.cond/while.body
+; loop left in place, i.e. no transformation is expected for this function.
+define i32 @compare_bytes_simple2(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple2(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; CHECK-NEXT:    store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple2(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    br label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT:    [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; LMUL8-NEXT:    store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple2(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    br label [[WHILE_COND:%.*]]
+; LOOP-DEL:       while.cond:
+; LOOP-DEL-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL:       while.body:
+; LOOP-DEL-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL:       while.end:
+; LOOP-DEL-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT:    [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT:    store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; LOOP-DEL-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %while.cond
+
+while.cond:
+  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+  %inc = add i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+  ; Picks %c on the mismatch exit (from while.body) and %d on the bound exit
+  ; (from while.cond); the store below writes through the selected pointer.
+  %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ]
+  store i32 %inc.lcssa, ptr %final_ptr
+  ret i32 %inc.lcssa
+}
+
+; Similar to @compare_bytes_simple2, except the only PHI in while.end merges
+; the loop index with an unrelated argument: %d on the mismatch exit and %inc
+; on the bound exit. The autogenerated assertions below all show the original
+; loop left in place.
+define i32 @compare_bytes_simple3(ptr %a, ptr %b, ptr %c, i32 %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple3(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT:    store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; CHECK-NEXT:    ret i32 [[FINAL_VAL]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple3(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    br label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT:    store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; LMUL8-NEXT:    ret i32 [[FINAL_VAL]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple3(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    br label [[WHILE_COND:%.*]]
+; LOOP-DEL:       while.cond:
+; LOOP-DEL-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL:       while.body:
+; LOOP-DEL-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL:       while.end:
+; LOOP-DEL-NEXT:    [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT:    store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; LOOP-DEL-NEXT:    ret i32 [[FINAL_VAL]]
+;
+  entry:
+  br label %while.cond
+
+  while.cond:
+  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+  %inc = add i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+  while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.end
+
+  while.end:
+  ; Takes %d on the mismatch exit (from while.body) and the loop index %inc on
+  ; the bound exit (from while.cond); this value is both stored and returned.
+  %final_val = phi i32 [ %d, %while.body ], [ %inc, %while.cond ]
+  store i32 %final_val, ptr %c
+  ret i32 %final_val
+}
+
+; Negative test: the loop must stay scalar when noimplicitfloat is present.
+define i32 @no_implicit_float(ptr %a, ptr %b, i32 %len, i32 %n) noimplicitfloat {
+; CHECK-LABEL: define i32 @no_implicit_float(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @no_implicit_float(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    br label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @no_implicit_float(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    br label [[WHILE_COND:%.*]]
+; LOOP-DEL:       while.cond:
+; LOOP-DEL-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL:       while.body:
+; LOOP-DEL-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL:       while.end:
+; LOOP-DEL-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %while.cond
+
+while.cond:
+  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+  %inc = add i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+  ret i32 %inc.lcssa
+}

>From d90f0d870aec931c0fa575fcee3bd81aa5287a13 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 13 Jun 2024 15:14:06 -0700
Subject: [PATCH 4/4] fixup! [RISCV][LoopIdiomVectorize] Support VP intrinsics
 in LoopIdiomVectorize

---
 .../Transforms/Vectorize/LoopIdiomVectorize.cpp   | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index fbdd36a329801..7e3184a12a6b1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -384,11 +384,6 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(IRBuilder<> &Builder,
   Value *PtrA = GEPA->getPointerOperand();
   Value *PtrB = GEPB->getPointerOperand();
 
-  // At this point we know two things must be true:
-  //  1. Start <= End
-  //  2. ExtMaxLen <= MinPageSize due to the page checks.
-  // Therefore, we know that we can use a 64-bit induction variable that
-  // starts from 0 -> ExtMaxLen and it will not overflow.
   ScalableVectorType *PredVTy =
       ScalableVectorType::get(Builder.getInt1Ty(), ByteCompareVF);
 
@@ -487,11 +482,6 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch(IRBuilder<> &Builder,
   Value *PtrA = GEPA->getPointerOperand();
   Value *PtrB = GEPB->getPointerOperand();
 
-  // At this point we know two things must be true:
-  //  1. Start <= End
-  //  2. ExtMaxLen <= 4096 due to the page checks.
-  // Therefore, we know that we can use a 64-bit induction variable that
-  // starts from 0 -> ExtMaxLen and it will not overflow.
   auto *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
   Builder.Insert(JumpToVectorLoop);
 
@@ -740,6 +730,11 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   // processed in each iteration, etc.
   Builder.SetInsertPoint(VectorLoopPreheaderBlock);
 
+  // At this point we know two things must be true:
+  //  1. Start <= End
+  //  2. ExtMaxLen <= MinPageSize due to the page checks.
+  // Therefore, a 64-bit induction variable that counts from 0 up to ExtMaxLen
+  // cannot overflow.
   Value *VectorLoopRes = nullptr;
   switch (VectorizeStyle) {
   case LoopIdiomVectorizeStyle::Masked:



More information about the llvm-commits mailing list