[llvm] [RISCV][LoopIdiomVectorize] Support VP intrinsics in LoopIdiomVectorize (PR #94082)

Min-Yih Hsu via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 21 11:08:11 PDT 2024


https://github.com/mshockwave updated https://github.com/llvm/llvm-project/pull/94082

>From c0b709210f29b7736bf990d8320efdbd95e67092 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 6 Jun 2024 13:20:55 -0700
Subject: [PATCH 1/3] [LoopIdiomVectorize][NFC] Factoring out the part that
 handles vectorization strategy

To pave the way for porting LIV to RISC-V, which uses VP intrinsics for
vectors.

NFC.
---
 .../Vectorize/LoopIdiomVectorize.cpp          | 240 ++++++++++--------
 1 file changed, 133 insertions(+), 107 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 38095b1433ebe..d4a417f4be8ad 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -78,6 +78,13 @@ class LoopIdiomVectorize {
   const TargetTransformInfo *TTI;
   const DataLayout *DL;
 
+  // Blocks that will be used for inserting vectorized code.
+  BasicBlock *EndBlock = nullptr;
+  BasicBlock *VectorLoopPreheaderBlock = nullptr;
+  BasicBlock *VectorLoopStartBlock = nullptr;
+  BasicBlock *VectorLoopMismatchBlock = nullptr;
+  BasicBlock *VectorLoopIncBlock = nullptr;
+
 public:
   explicit LoopIdiomVectorize(DominatorTree *DT, LoopInfo *LI,
                               const TargetTransformInfo *TTI,
@@ -95,9 +102,16 @@ class LoopIdiomVectorize {
                       SmallVectorImpl<BasicBlock *> &ExitBlocks);
 
   bool recognizeByteCompare();
+
   Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
                             GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
                             Instruction *Index, Value *Start, Value *MaxLen);
+
+  Value *createMaskedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
+                                  GetElementPtrInst *GEPA,
+                                  GetElementPtrInst *GEPB, Value *ExtStart,
+                                  Value *ExtEnd);
+
   void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
                             PHINode *IndPhi, Value *MaxLen, Instruction *Index,
                             Value *Start, bool IncIdx, BasicBlock *FoundBB,
@@ -331,6 +345,115 @@ bool LoopIdiomVectorize::recognizeByteCompare() {
   return true;
 }
 
+Value *LoopIdiomVectorize::createMaskedFindMismatch(
+    IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
+    GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) {
+  Type *I64Type = Builder.getInt64Ty();
+  Type *ResType = Builder.getInt32Ty();
+  Type *LoadType = Builder.getInt8Ty();
+  Value *PtrA = GEPA->getPointerOperand();
+  Value *PtrB = GEPB->getPointerOperand();
+
+  // At this point we know two things must be true:
+  //  1. Start <= End
+  //  2. ExtMaxLen <= MinPageSize due to the page checks.
+  // Therefore, we know that we can use a 64-bit induction variable that
+  // starts from 0 -> ExtMaxLen and it will not overflow.
+  ScalableVectorType *PredVTy =
+      ScalableVectorType::get(Builder.getInt1Ty(), 16);
+
+  Value *InitialPred = Builder.CreateIntrinsic(
+      Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
+
+  Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
+  VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "",
+                             /*HasNUW=*/true, /*HasNSW=*/true);
+
+  Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
+                                            Builder.getInt1(false));
+
+  BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
+  Builder.Insert(JumpToVectorLoop);
+
+  DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
+                     VectorLoopStartBlock}});
+
+  // Set up the first vector loop block by creating the PHIs, doing the vector
+  // loads and comparing the vectors.
+  Builder.SetInsertPoint(VectorLoopStartBlock);
+  PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred");
+  LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
+  PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
+  VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
+  Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16);
+  Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
+
+  Value *VectorLhsGep =
+      Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds());
+  Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep,
+                                                  Align(1), LoopPred, Passthru);
+
+  Value *VectorRhsGep =
+      Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds());
+  Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep,
+                                                  Align(1), LoopPred, Passthru);
+
+  Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad);
+  VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse);
+  Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp);
+  BranchInst *VectorEarlyExit = BranchInst::Create(
+      VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
+  Builder.Insert(VectorEarlyExit);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
+       {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
+
+  // Increment the index counter and calculate the predicate for the next
+  // iteration of the loop. We branch back to the start of the loop if there
+  // is at least one active lane.
+  Builder.SetInsertPoint(VectorLoopIncBlock);
+  Value *NewVectorIndexPhi =
+      Builder.CreateAdd(VectorIndexPhi, VecLen, "",
+                        /*HasNUW=*/true, /*HasNSW=*/true);
+  VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
+  Value *NewPred =
+      Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
+                              {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd});
+  LoopPred->addIncoming(NewPred, VectorLoopIncBlock);
+
+  Value *PredHasActiveLanes =
+      Builder.CreateExtractElement(NewPred, uint64_t(0));
+  BranchInst *VectorLoopBranchBack =
+      BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
+  Builder.Insert(VectorLoopBranchBack);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
+       {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
+
+  // If we found a mismatch then we need to calculate which lane in the vector
+  // had a mismatch and add that on to the current loop index.
+  Builder.SetInsertPoint(VectorLoopMismatchBlock);
+  PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred");
+  FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock);
+  PHINode *LastLoopPred =
+      Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred");
+  LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock);
+  PHINode *VectorFoundIndex =
+      Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index");
+  VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
+
+  Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred);
+  Value *Ctz = Builder.CreateIntrinsic(
+      Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()},
+      {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)});
+  Ctz = Builder.CreateZExt(Ctz, I64Type);
+  Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "",
+                                             /*HasNUW=*/true, /*HasNSW=*/true);
+  return Builder.CreateTrunc(VectorLoopRes64, ResType);
+}
+
 Value *LoopIdiomVectorize::expandFindMismatch(
     IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
     GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) {
@@ -345,8 +468,7 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   Type *ResType = Builder.getInt32Ty();
 
   // Split block in the original loop preheader.
-  BasicBlock *EndBlock =
-      SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end");
+  EndBlock = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end");
 
   // Create the blocks that we're going to need:
   //  1. A block for checking the zero-extended length exceeds 0
@@ -370,17 +492,17 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   BasicBlock *MemCheckBlock = BasicBlock::Create(
       Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock);
 
-  BasicBlock *VectorLoopPreheaderBlock = BasicBlock::Create(
+  VectorLoopPreheaderBlock = BasicBlock::Create(
       Ctx, "mismatch_vec_loop_preheader", EndBlock->getParent(), EndBlock);
 
-  BasicBlock *VectorLoopStartBlock = BasicBlock::Create(
-      Ctx, "mismatch_vec_loop", EndBlock->getParent(), EndBlock);
+  VectorLoopStartBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop",
+                                            EndBlock->getParent(), EndBlock);
 
-  BasicBlock *VectorLoopIncBlock = BasicBlock::Create(
-      Ctx, "mismatch_vec_loop_inc", EndBlock->getParent(), EndBlock);
+  VectorLoopIncBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_inc",
+                                          EndBlock->getParent(), EndBlock);
 
-  BasicBlock *VectorLoopMismatchBlock = BasicBlock::Create(
-      Ctx, "mismatch_vec_loop_found", EndBlock->getParent(), EndBlock);
+  VectorLoopMismatchBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_found",
+                                               EndBlock->getParent(), EndBlock);
 
   BasicBlock *LoopPreHeaderBlock = BasicBlock::Create(
       Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock);
@@ -491,104 +613,8 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   // processed in each iteration, etc.
   Builder.SetInsertPoint(VectorLoopPreheaderBlock);
 
-  // At this point we know two things must be true:
-  //  1. Start <= End
-  //  2. ExtMaxLen <= MinPageSize due to the page checks.
-  // Therefore, we know that we can use a 64-bit induction variable that
-  // starts from 0 -> ExtMaxLen and it will not overflow.
-  ScalableVectorType *PredVTy =
-      ScalableVectorType::get(Builder.getInt1Ty(), 16);
-
-  Value *InitialPred = Builder.CreateIntrinsic(
-      Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
-
-  Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
-  VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "",
-                             /*HasNUW=*/true, /*HasNSW=*/true);
-
-  Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
-                                            Builder.getInt1(false));
-
-  BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
-  Builder.Insert(JumpToVectorLoop);
-
-  DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
-                     VectorLoopStartBlock}});
-
-  // Set up the first vector loop block by creating the PHIs, doing the vector
-  // loads and comparing the vectors.
-  Builder.SetInsertPoint(VectorLoopStartBlock);
-  PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred");
-  LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
-  PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
-  VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
-  Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16);
-  Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
-
-  Value *VectorLhsGep =
-      Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds());
-  Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep,
-                                                  Align(1), LoopPred, Passthru);
-
-  Value *VectorRhsGep =
-      Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds());
-  Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep,
-                                                  Align(1), LoopPred, Passthru);
-
-  Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad);
-  VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse);
-  Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp);
-  BranchInst *VectorEarlyExit = BranchInst::Create(
-      VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
-  Builder.Insert(VectorEarlyExit);
-
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
-       {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
-
-  // Increment the index counter and calculate the predicate for the next
-  // iteration of the loop. We branch back to the start of the loop if there
-  // is at least one active lane.
-  Builder.SetInsertPoint(VectorLoopIncBlock);
-  Value *NewVectorIndexPhi =
-      Builder.CreateAdd(VectorIndexPhi, VecLen, "",
-                        /*HasNUW=*/true, /*HasNSW=*/true);
-  VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
-  Value *NewPred =
-      Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
-                              {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd});
-  LoopPred->addIncoming(NewPred, VectorLoopIncBlock);
-
-  Value *PredHasActiveLanes =
-      Builder.CreateExtractElement(NewPred, uint64_t(0));
-  BranchInst *VectorLoopBranchBack =
-      BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
-  Builder.Insert(VectorLoopBranchBack);
-
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
-       {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
-
-  // If we found a mismatch then we need to calculate which lane in the vector
-  // had a mismatch and add that on to the current loop index.
-  Builder.SetInsertPoint(VectorLoopMismatchBlock);
-  PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred");
-  FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock);
-  PHINode *LastLoopPred =
-      Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred");
-  LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock);
-  PHINode *VectorFoundIndex =
-      Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index");
-  VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
-
-  Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred);
-  Value *Ctz = Builder.CreateIntrinsic(
-      Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()},
-      {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)});
-  Ctz = Builder.CreateZExt(Ctz, I64Type);
-  Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "",
-                                             /*HasNUW=*/true, /*HasNSW=*/true);
-  Value *VectorLoopRes = Builder.CreateTrunc(VectorLoopRes64, ResType);
+  Value *VectorLoopRes =
+      createMaskedFindMismatch(Builder, DTU, GEPA, GEPB, ExtStart, ExtEnd);
 
   Builder.Insert(BranchInst::Create(EndBlock));
 

>From 21b0f347c85eee5eb9319bf8883503f6ee7e3e4d Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 30 May 2024 11:10:07 -0700
Subject: [PATCH 2/3] [RISCV][LoopIdiomVectorize] Support VP intrinsics in
 LoopIdiomVectorize

Teach LoopIdiomVectorize to use VP intrinsics to replace the byte
compare loops. Right now only RISC-V uses LoopIdiomVectorize of this
style.
---
 .../Transforms/Vectorize/LoopIdiomVectorize.h |   17 +-
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |   10 +
 llvm/lib/Target/RISCV/RISCVTargetMachine.h    |    2 +
 .../Target/RISCV/RISCVTargetTransformInfo.h   |    2 +
 .../Vectorize/LoopIdiomVectorize.cpp          |  180 +-
 .../LoopIdiom/RISCV/byte-compare-index.ll     | 1751 +++++++++++++++++
 6 files changed, 1948 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h
index 56f44b7dc6b2a..ef6e0e0687809 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h
@@ -13,7 +13,22 @@
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 
 namespace llvm {
-struct LoopIdiomVectorizePass : PassInfoMixin<LoopIdiomVectorizePass> {
+enum class LoopIdiomVectorizeStyle { Masked, Predicated };
+
+class LoopIdiomVectorizePass : public PassInfoMixin<LoopIdiomVectorizePass> {
+  LoopIdiomVectorizeStyle VectorizeStyle = LoopIdiomVectorizeStyle::Masked;
+
+  // The VF used in vectorizing the byte compare pattern.
+  unsigned ByteCompareVF = 16;
+
+public:
+  LoopIdiomVectorizePass() = default;
+  explicit LoopIdiomVectorizePass(LoopIdiomVectorizeStyle S)
+      : VectorizeStyle(S) {}
+
+  LoopIdiomVectorizePass(LoopIdiomVectorizeStyle S, unsigned BCVF)
+      : VectorizeStyle(S), ByteCompareVF(BCVF) {}
+
   PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
                         LoopStandardAnalysisResults &AR, LPMUpdater &U);
 };
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 35d0b3408d09f..caa04830d1c37 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -33,10 +33,12 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Passes/PassBuilder.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
 #include <optional>
 using namespace llvm;
 
@@ -573,6 +575,14 @@ void RISCVPassConfig::addPostRegAlloc() {
     addPass(createRISCVRedundantCopyEliminationPass());
 }
 
+void RISCVTargetMachine::registerPassBuilderCallbacks(
+    PassBuilder &PB, bool PopulateClassToPassNames) {
+  PB.registerLateLoopOptimizationsEPCallback([=](LoopPassManager &LPM,
+                                                 OptimizationLevel Level) {
+    LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated));
+  });
+}
+
 yaml::MachineFunctionInfo *
 RISCVTargetMachine::createDefaultFuncInfoYAML() const {
   return new yaml::RISCVMachineFunctionInfo();
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
index 68dfb3c81f2fe..7111d5ec80e47 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
@@ -59,6 +59,8 @@ class RISCVTargetMachine : public LLVMTargetMachine {
                                 PerFunctionMIParsingState &PFS,
                                 SMDiagnostic &Error,
                                 SMRange &SourceRange) const override;
+  void registerPassBuilderCallbacks(PassBuilder &PB,
+                                    bool PopulateClassToPassNames) override;
 };
 } // namespace llvm
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index a4d1390875095..073779e07b513 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -397,6 +397,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
   bool shouldFoldTerminatingConditionAfterLSR() const {
     return true;
   }
+
+  std::optional<unsigned> getMinPageSize() const { return 4096; }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index d4a417f4be8ad..332b8498a7581 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -59,19 +59,34 @@ static cl::opt<bool> DisableAll("disable-loop-idiom-vectorize-all", cl::Hidden,
                                 cl::init(false),
                                 cl::desc("Disable Loop Idiom Vectorize Pass."));
 
+static cl::opt<LoopIdiomVectorizeStyle>
+    LITVecStyle("loop-idiom-vectorize-style", cl::Hidden,
+                cl::desc("The vectorization style for loop idiom transform."),
+                cl::values(clEnumValN(LoopIdiomVectorizeStyle::Masked, "masked",
+                                      "Use masked vector intrinsics"),
+                           clEnumValN(LoopIdiomVectorizeStyle::Predicated,
+                                      "predicated", "Use VP intrinsics")),
+                cl::init(LoopIdiomVectorizeStyle::Masked));
+
 static cl::opt<bool>
     DisableByteCmp("disable-loop-idiom-vectorize-bytecmp", cl::Hidden,
                    cl::init(false),
                    cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
                             "not convert byte-compare loop(s)."));
 
+static cl::opt<unsigned>
+    ByteCmpVF("loop-idiom-vectorize-bytecmp-vf", cl::Hidden,
+              cl::desc("The vectorization factor for byte-compare patterns."),
+              cl::init(16));
+
 static cl::opt<bool>
     VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false),
                 cl::desc("Verify loops generated Loop Idiom Vectorize Pass."));
 
 namespace {
-
 class LoopIdiomVectorize {
+  LoopIdiomVectorizeStyle VectorizeStyle;
+  unsigned ByteCompareVF;
   Loop *CurLoop = nullptr;
   DominatorTree *DT;
   LoopInfo *LI;
@@ -86,10 +101,11 @@ class LoopIdiomVectorize {
   BasicBlock *VectorLoopIncBlock = nullptr;
 
 public:
-  explicit LoopIdiomVectorize(DominatorTree *DT, LoopInfo *LI,
-                              const TargetTransformInfo *TTI,
-                              const DataLayout *DL)
-      : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+  LoopIdiomVectorize(LoopIdiomVectorizeStyle S, unsigned VF, DominatorTree *DT,
+                     LoopInfo *LI, const TargetTransformInfo *TTI,
+                     const DataLayout *DL)
+      : VectorizeStyle(S), ByteCompareVF(VF), DT(DT), LI(LI), TTI(TTI), DL(DL) {
+  }
 
   bool run(Loop *L);
 
@@ -111,6 +127,10 @@ class LoopIdiomVectorize {
                                   GetElementPtrInst *GEPA,
                                   GetElementPtrInst *GEPB, Value *ExtStart,
                                   Value *ExtEnd);
+  Value *createPredicatedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
+                                      GetElementPtrInst *GEPA,
+                                      GetElementPtrInst *GEPB, Value *ExtStart,
+                                      Value *ExtEnd);
 
   void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
                             PHINode *IndPhi, Value *MaxLen, Instruction *Index,
@@ -128,8 +148,16 @@ PreservedAnalyses LoopIdiomVectorizePass::run(Loop &L, LoopAnalysisManager &AM,
 
   const auto *DL = &L.getHeader()->getModule()->getDataLayout();
 
-  LoopIdiomVectorize LIT(&AR.DT, &AR.LI, &AR.TTI, DL);
-  if (!LIT.run(&L))
+  LoopIdiomVectorizeStyle VecStyle = VectorizeStyle;
+  if (LITVecStyle.getNumOccurrences())
+    VecStyle = LITVecStyle;
+
+  unsigned BCVF = ByteCompareVF;
+  if (ByteCmpVF.getNumOccurrences())
+    BCVF = ByteCmpVF;
+
+  LoopIdiomVectorize LIV(VecStyle, BCVF, &AR.DT, &AR.LI, &AR.TTI, DL);
+  if (!LIV.run(&L))
     return PreservedAnalyses::all();
 
   return PreservedAnalyses::none();
@@ -360,14 +388,15 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(
   // Therefore, we know that we can use a 64-bit induction variable that
   // starts from 0 -> ExtMaxLen and it will not overflow.
   ScalableVectorType *PredVTy =
-      ScalableVectorType::get(Builder.getInt1Ty(), 16);
+      ScalableVectorType::get(Builder.getInt1Ty(), ByteCompareVF);
 
   Value *InitialPred = Builder.CreateIntrinsic(
       Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
 
   Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
-  VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "",
-                             /*HasNUW=*/true, /*HasNSW=*/true);
+  VecLen =
+      Builder.CreateMul(VecLen, ConstantInt::get(I64Type, ByteCompareVF), "",
+                        /*HasNUW=*/true, /*HasNSW=*/true);
 
   Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
                                             Builder.getInt1(false));
@@ -385,7 +414,8 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(
   LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
   PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
   VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
-  Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16);
+  Type *VectorLoadType =
+      ScalableVectorType::get(Builder.getInt8Ty(), ByteCompareVF);
   Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
 
   Value *VectorLhsGep =
@@ -454,6 +484,121 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(
   return Builder.CreateTrunc(VectorLoopRes64, ResType);
 }
 
+Value *LoopIdiomVectorize::createPredicatedFindMismatch(
+    IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
+    GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) {
+  Type *I64Type = Builder.getInt64Ty();
+  Type *I32Type = Builder.getInt32Ty();
+  Type *ResType = I32Type;
+  Type *LoadType = Builder.getInt8Ty();
+  Value *PtrA = GEPA->getPointerOperand();
+  Value *PtrB = GEPB->getPointerOperand();
+
+  // At this point we know two things must be true:
+  //  1. Start <= End
+  //  2. ExtMaxLen <= 4096 due to the page checks.
+  // Therefore, we know that we can use a 64-bit induction variable that
+  // starts from 0 -> ExtMaxLen and it will not overflow.
+  auto *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
+  Builder.Insert(JumpToVectorLoop);
+
+  DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
+                     VectorLoopStartBlock}});
+
+  // Set up the first Vector loop block by creating the PHIs, doing the vector
+  // loads and comparing the vectors.
+  Builder.SetInsertPoint(VectorLoopStartBlock);
+  auto *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vector_index");
+  VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
+
+  // Calculate AVL by subtracting the vector loop index from the trip count
+  Value *AVL = Builder.CreateSub(ExtEnd, VectorIndexPhi, "avl", /*HasNUW=*/true,
+                                 /*HasNSW=*/true);
+
+  auto *VectorLoadType = ScalableVectorType::get(LoadType, ByteCompareVF);
+  auto *VF = ConstantInt::get(
+      I32Type, VectorLoadType->getElementCount().getKnownMinValue());
+  auto *IsScalable = ConstantInt::getBool(
+      Builder.getContext(), VectorLoadType->getElementCount().isScalable());
+
+  Value *VL = Builder.CreateIntrinsic(Intrinsic::experimental_get_vector_length,
+                                      {I64Type}, {AVL, VF, IsScalable});
+  Value *GepOffset = VectorIndexPhi;
+
+  Value *VectorLhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset);
+  if (GEPA->isInBounds())
+    cast<GetElementPtrInst>(VectorLhsGep)->setIsInBounds(true);
+  VectorType *TrueMaskTy =
+      VectorType::get(Builder.getInt1Ty(), VectorLoadType->getElementCount());
+  Value *AllTrueMask = Constant::getAllOnesValue(TrueMaskTy);
+  Value *VectorLhsLoad = Builder.CreateIntrinsic(
+      Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
+      {VectorLhsGep, AllTrueMask, VL}, nullptr, "lhs.load");
+
+  Value *VectorRhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset);
+  if (GEPB->isInBounds())
+    cast<GetElementPtrInst>(VectorRhsGep)->setIsInBounds(true);
+  Value *VectorRhsLoad = Builder.CreateIntrinsic(
+      Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
+      {VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load");
+
+  StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE);
+  auto *PredicateMDS = MDString::get(VectorLhsLoad->getContext(), PredicateStr);
+  Value *Pred = MetadataAsValue::get(VectorLhsLoad->getContext(), PredicateMDS);
+  Value *VectorMatchCmp = Builder.CreateIntrinsic(
+      Intrinsic::vp_icmp, {VectorLhsLoad->getType()},
+      {VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr,
+      "mismatch.cmp");
+  Value *CTZ = Builder.CreateIntrinsic(
+      Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()},
+      {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true), AllTrueMask,
+       VL});
+  // RISC-V refines/lowers the poison returned by vp.cttz.elts to -1.
+  Value *MismatchFound =
+      Builder.CreateICmpSGE(CTZ, ConstantInt::get(ResType, 0));
+  auto *VectorEarlyExit = BranchInst::Create(VectorLoopMismatchBlock,
+                                             VectorLoopIncBlock, MismatchFound);
+  Builder.Insert(VectorEarlyExit);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
+       {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
+
+  // Increment the index counter and calculate the predicate for the next
+  // iteration of the loop. We branch back to the start of the loop if there
+  // is at least one active lane.
+  Builder.SetInsertPoint(VectorLoopIncBlock);
+  Value *VL64 = Builder.CreateZExt(VL, I64Type);
+  Value *NewVectorIndexPhi =
+      Builder.CreateAdd(VectorIndexPhi, VL64, "",
+                        /*HasNUW=*/true, /*HasNSW=*/true);
+  VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
+  Value *ExitCond = Builder.CreateICmpNE(NewVectorIndexPhi, ExtEnd);
+  auto *VectorLoopBranchBack =
+      BranchInst::Create(VectorLoopStartBlock, EndBlock, ExitCond);
+  Builder.Insert(VectorLoopBranchBack);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
+       {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
+
+  // If we found a mismatch then we need to calculate which lane in the vector
+  // had a mismatch and add that on to the current loop index.
+  Builder.SetInsertPoint(VectorLoopMismatchBlock);
+
+  // Add LCSSA phis for CTZ and VectorIndexPhi.
+  auto *CTZLCSSAPhi = Builder.CreatePHI(CTZ->getType(), 1, "ctz");
+  CTZLCSSAPhi->addIncoming(CTZ, VectorLoopStartBlock);
+  auto *VectorIndexLCSSAPhi =
+      Builder.CreatePHI(VectorIndexPhi->getType(), 1, "mismatch_vector_index");
+  VectorIndexLCSSAPhi->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
+
+  Value *CTZI64 = Builder.CreateZExt(CTZLCSSAPhi, I64Type);
+  Value *VectorLoopRes64 = Builder.CreateAdd(VectorIndexLCSSAPhi, CTZI64, "",
+                                             /*HasNUW=*/true, /*HasNSW=*/true);
+  return Builder.CreateTrunc(VectorLoopRes64, ResType);
+}
+
 Value *LoopIdiomVectorize::expandFindMismatch(
     IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
     GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) {
@@ -613,8 +758,17 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   // processed in each iteration, etc.
   Builder.SetInsertPoint(VectorLoopPreheaderBlock);
 
-  Value *VectorLoopRes =
-      createMaskedFindMismatch(Builder, DTU, GEPA, GEPB, ExtStart, ExtEnd);
+  Value *VectorLoopRes = nullptr;
+  switch (VectorizeStyle) {
+  case LoopIdiomVectorizeStyle::Masked:
+    VectorLoopRes =
+        createMaskedFindMismatch(Builder, DTU, GEPA, GEPB, ExtStart, ExtEnd);
+    break;
+  case LoopIdiomVectorizeStyle::Predicated:
+    VectorLoopRes = createPredicatedFindMismatch(Builder, DTU, GEPA, GEPB,
+                                                 ExtStart, ExtEnd);
+    break;
+  }
 
   Builder.Insert(BranchInst::Create(EndBlock));
 
diff --git a/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
new file mode 100644
index 0000000000000..845daa402606f
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
@@ -0,0 +1,1751 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=loop-idiom-vectorize -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -mattr=+v -S < %s | FileCheck %s
+; RUN: opt -passes=loop-idiom-vectorize -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -loop-idiom-vectorize-bytecmp-vf=64 -mattr=+v -S < %s | FileCheck %s --check-prefix=LMUL8
+; RUN: opt -passes='loop(loop-idiom-vectorize),simplifycfg' -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -mattr=+v -S < %s | FileCheck %s --check-prefix=LOOP-DEL
+
+define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK:       mismatch_min_it_check:
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       mismatch_mem_check:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       mismatch_vec_loop_preheader:
+; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK:       mismatch_vec_loop:
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK:       mismatch_vec_loop_inc:
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK:       mismatch_vec_loop_found:
+; CHECK-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT:    br label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_pre:
+; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; CHECK:       mismatch_loop:
+; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_inc:
+; CHECK-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK:       mismatch_end:
+; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK:       byte.compare:
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8:       mismatch_min_it_check:
+; LMUL8-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; LMUL8:       mismatch_mem_check:
+; LMUL8-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; LMUL8:       mismatch_vec_loop_preheader:
+; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8:       mismatch_vec_loop:
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8:       mismatch_vec_loop_inc:
+; LMUL8-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8:       mismatch_vec_loop_found:
+; LMUL8-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT:    br label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_pre:
+; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LMUL8:       mismatch_loop:
+; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_inc:
+; LMUL8-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8:       mismatch_end:
+; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8:       byte.compare:
+; LMUL8-NEXT:    br label [[WHILE_END]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; LOOP-DEL:       mismatch_mem_check:
+; LOOP-DEL-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1:![0-9]+]]
+; LOOP-DEL:       mismatch_vec_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL:       mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL:       mismatch_vec_loop_found:
+; LOOP-DEL-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT:    br label [[WHILE_END]]
+; LOOP-DEL:       mismatch_loop_pre:
+; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL:       mismatch_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL:       mismatch_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL:       while.end:
+; LOOP-DEL-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT:    ret i32 [[MISMATCH_RESULT]]
+;
+entry:
+  br label %while.cond
+
+while.cond:
+  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+  %inc = add i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+  ret i32 %inc.lcssa
+}
+
+define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_signed_wrap(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK:       mismatch_min_it_check:
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK:       mismatch_mem_check:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK:       mismatch_vec_loop_preheader:
+; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK:       mismatch_vec_loop:
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK:       mismatch_vec_loop_inc:
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK:       mismatch_vec_loop_found:
+; CHECK-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT:    br label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_pre:
+; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; CHECK:       mismatch_loop:
+; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_inc:
+; CHECK-NEXT:    [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK:       mismatch_end:
+; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK:       byte.compare:
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_signed_wrap(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8:       mismatch_min_it_check:
+; LMUL8-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8:       mismatch_mem_check:
+; LMUL8-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8:       mismatch_vec_loop_preheader:
+; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8:       mismatch_vec_loop:
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8:       mismatch_vec_loop_inc:
+; LMUL8-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8:       mismatch_vec_loop_found:
+; LMUL8-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT:    br label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_pre:
+; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LMUL8:       mismatch_loop:
+; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_inc:
+; LMUL8-NEXT:    [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8:       mismatch_end:
+; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8:       byte.compare:
+; LMUL8-NEXT:    br label [[WHILE_END]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_signed_wrap(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL:       mismatch_mem_check:
+; LOOP-DEL-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL:       mismatch_vec_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL:       mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL:       mismatch_vec_loop_found:
+; LOOP-DEL-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT:    br label [[WHILE_END]]
+; LOOP-DEL:       mismatch_loop_pre:
+; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL:       mismatch_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL:       mismatch_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL:       while.end:
+; LOOP-DEL-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT:    ret i32 [[MISMATCH_RESULT]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_signed_wrap(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
+; NO-TRANSFORM-NEXT:  entry:
+; NO-TRANSFORM-NEXT:    br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM:       while.cond:
+; NO-TRANSFORM-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT:    [[INC]] = add nsw i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM:       while.body:
+; NO-TRANSFORM-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; NO-TRANSFORM:       while.end:
+; NO-TRANSFORM-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT:    ret i32 [[INC_LCSSA]]
+entry:
+  br label %while.cond
+
+while.cond:
+  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+  %inc = add nsw i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+  ret i32 %inc.lcssa
+}
+
+
+define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK:       mismatch_min_it_check:
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK:       mismatch_mem_check:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK:       mismatch_vec_loop_preheader:
+; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK:       mismatch_vec_loop:
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK:       mismatch_vec_loop_inc:
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK:       mismatch_vec_loop_found:
+; CHECK-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT:    br label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_pre:
+; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; CHECK:       mismatch_loop:
+; CHECK-NEXT:    [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_inc:
+; CHECK-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK:       mismatch_end:
+; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; CHECK:       while.found:
+; CHECK-NEXT:    [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    br label [[END:%.*]]
+; CHECK:       byte.compare:
+; CHECK-NEXT:    [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; CHECK-NEXT:    store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; CHECK-NEXT:    ret i32 [[MISMATCH_INDEX]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8:       mismatch_min_it_check:
+; LMUL8-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8:       mismatch_mem_check:
+; LMUL8-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8:       mismatch_vec_loop_preheader:
+; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8:       mismatch_vec_loop:
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8:       mismatch_vec_loop_inc:
+; LMUL8-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8:       mismatch_vec_loop_found:
+; LMUL8-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT:    br label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_pre:
+; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LMUL8:       mismatch_loop:
+; LMUL8-NEXT:    [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; LMUL8-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_inc:
+; LMUL8-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; LMUL8-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8:       mismatch_end:
+; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; LMUL8:       while.found:
+; LMUL8-NEXT:    [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    br label [[END:%.*]]
+; LMUL8:       byte.compare:
+; LMUL8-NEXT:    [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    br label [[END]]
+; LMUL8:       end:
+; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; LMUL8-NEXT:    [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; LMUL8-NEXT:    store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; LMUL8-NEXT:    ret i32 [[MISMATCH_INDEX]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL:       mismatch_mem_check:
+; LOOP-DEL-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL:       mismatch_vec_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL:       mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[BYTE_COMPARE:%.*]]
+; LOOP-DEL:       mismatch_vec_loop_found:
+; LOOP-DEL-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT:    br label [[BYTE_COMPARE]]
+; LOOP-DEL:       mismatch_loop_pre:
+; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL:       mismatch_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; LOOP-DEL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[BYTE_COMPARE]]
+; LOOP-DEL:       mismatch_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; LOOP-DEL-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP36]], label [[BYTE_COMPARE]], label [[MISMATCH_LOOP]]
+; LOOP-DEL:       byte.compare:
+; LOOP-DEL-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT:    [[TMP37:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LOOP-DEL-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[TMP37]], i32 [[N]], i32 [[MISMATCH_RESULT]]
+; LOOP-DEL-NEXT:    [[SPEC_SELECT4:%.*]] = select i1 [[TMP37]], ptr [[D]], ptr [[C]]
+; LOOP-DEL-NEXT:    store i32 [[SPEC_SELECT]], ptr [[SPEC_SELECT4]], align 4
+; LOOP-DEL-NEXT:    ret i32 [[SPEC_SELECT]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
+; NO-TRANSFORM-NEXT:  entry:
+; NO-TRANSFORM-NEXT:    br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM:       while.cond:
+; NO-TRANSFORM-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM:       while.body:
+; NO-TRANSFORM-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; NO-TRANSFORM:       while.found:
+; NO-TRANSFORM-NEXT:    [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT:    [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT:    br label [[END:%.*]]
+; NO-TRANSFORM:       while.end:
+; NO-TRANSFORM-NEXT:    [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT:    [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT:    br label [[END]]
+; NO-TRANSFORM:       end:
+; NO-TRANSFORM-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; NO-TRANSFORM-NEXT:    [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; NO-TRANSFORM-NEXT:    store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; NO-TRANSFORM-NEXT:    ret i32 [[MISMATCH_INDEX]]
+entry:
+  br label %while.cond
+
+while.cond:
+  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+  %inc = add i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.found
+
+while.found:
+  %mismatch_index1 = phi i32 [ %inc, %while.body ]
+  %found_ptr = phi ptr [ %c, %while.body ]
+  br label %end
+
+while.end:
+  %mismatch_index2 = phi i32 [ %n, %while.cond ]
+  %end_ptr = phi ptr [ %d, %while.cond ]
+  br label %end
+
+end:
+  %mismatch_index = phi i32 [ %mismatch_index1, %while.found ], [ %mismatch_index2, %while.end ]
+  %store_ptr = phi ptr [ %end_ptr, %while.end ], [ %found_ptr, %while.found ]
+  store i32 %mismatch_index, ptr %store_ptr
+  ret i32 %mismatch_index
+}
+
+
+
+define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
+; CHECK-LABEL: define i32 @compare_bytes_extra_cmp(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; CHECK-NEXT:    br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; CHECK:       ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK:       mismatch_min_it_check:
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK:       mismatch_mem_check:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK:       mismatch_vec_loop_preheader:
+; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK:       mismatch_vec_loop:
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK:       mismatch_vec_loop_inc:
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK:       mismatch_vec_loop_found:
+; CHECK-NEXT:    [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; CHECK-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT:    br label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_pre:
+; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; CHECK:       mismatch_loop:
+; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_inc:
+; CHECK-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK:       mismatch_end:
+; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
+; CHECK:       byte.compare:
+; CHECK-NEXT:    br label [[WHILE_END_LOOPEXIT]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_extra_cmp(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; LMUL8-NEXT:    br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; LMUL8:       ph:
+; LMUL8-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8:       mismatch_min_it_check:
+; LMUL8-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8:       mismatch_mem_check:
+; LMUL8-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8:       mismatch_vec_loop_preheader:
+; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8:       mismatch_vec_loop:
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8:       mismatch_vec_loop_inc:
+; LMUL8-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8:       mismatch_vec_loop_found:
+; LMUL8-NEXT:    [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; LMUL8-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; LMUL8-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT:    br label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_pre:
+; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LMUL8:       mismatch_loop:
+; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_inc:
+; LMUL8-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8:       mismatch_end:
+; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
+; LMUL8:       byte.compare:
+; LMUL8-NEXT:    br label [[WHILE_END_LOOPEXIT]]
+; LMUL8:       while.end.loopexit:
+; LMUL8-NEXT:    [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    br label [[WHILE_END]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
+; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_extra_cmp(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; LOOP-DEL:       ph:
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL:       mismatch_mem_check:
+; LOOP-DEL-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL:       mismatch_vec_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL:       mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END]]
+; LOOP-DEL:       mismatch_vec_loop_found:
+; LOOP-DEL-NEXT:    [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; LOOP-DEL-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; LOOP-DEL-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT:    br label [[WHILE_END]]
+; LOOP-DEL:       mismatch_loop_pre:
+; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL:       mismatch_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL:       mismatch_loop_inc:
+; LOOP-DEL-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL:       while.end:
+; LOOP-DEL-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_extra_cmp(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) {
+; NO-TRANSFORM-NEXT:  entry:
+; NO-TRANSFORM-NEXT:    [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; NO-TRANSFORM:       ph:
+; NO-TRANSFORM-NEXT:    br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM:       while.cond:
+; NO-TRANSFORM-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[PH]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]]
+; NO-TRANSFORM:       while.body:
+; NO-TRANSFORM-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; NO-TRANSFORM:       while.end:
+; NO-TRANSFORM-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ], [ [[X]], [[ENTRY:%.*]] ]
+; NO-TRANSFORM-NEXT:    ret i32 [[INC_LCSSA]]
+entry:
+  %cmp.x = icmp ult i32 %n, %x
+  br i1 %cmp.x, label %ph, label %while.end
+
+ph:
+  br label %while.cond
+
+while.cond:
+  %len.addr = phi i32 [ %len, %ph ], [ %inc, %while.body ]
+  %inc = add i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ], [ %x, %entry ]
+  ret i32 %inc.lcssa
+}
+
+define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) {
+; CHECK-LABEL: define void @compare_bytes_cleanup_block(
+; CHECK-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK:       mismatch_min_it_check:
+; CHECK-NEXT:    br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK:       mismatch_mem_check:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP10]], 12
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP4]], 12
+; CHECK-NEXT:    [[TMP8:%.*]] = lshr i64 [[TMP9]], 12
+; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK:       mismatch_vec_loop_preheader:
+; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK:       mismatch_vec_loop:
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 0, [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP16]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK:       mismatch_vec_loop_inc:
+; CHECK-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
+; CHECK-NEXT:    [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK:       mismatch_vec_loop_found:
+; CHECK-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+; CHECK-NEXT:    br label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_pre:
+; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; CHECK:       mismatch_loop:
+; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]]
+; CHECK-NEXT:    br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK:       mismatch_loop_inc:
+; CHECK-NEXT:    [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK:       mismatch_end:
+; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
+; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; CHECK:       byte.compare:
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; CHECK-NEXT:    br i1 [[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]]
+; CHECK:       cleanup.thread:
+; CHECK-NEXT:    ret void
+; CHECK:       if.end:
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT:    ret void
+;
+; LMUL8-LABEL: define void @compare_bytes_cleanup_block(
+; LMUL8-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8:       mismatch_min_it_check:
+; LMUL8-NEXT:    br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8:       mismatch_mem_check:
+; LMUL8-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
+; LMUL8-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
+; LMUL8-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LMUL8-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; LMUL8-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
+; LMUL8-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
+; LMUL8-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
+; LMUL8-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP10]], 12
+; LMUL8-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP4]], 12
+; LMUL8-NEXT:    [[TMP8:%.*]] = lshr i64 [[TMP9]], 12
+; LMUL8-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]]
+; LMUL8-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]]
+; LMUL8-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; LMUL8-NEXT:    br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8:       mismatch_vec_loop_preheader:
+; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8:       mismatch_vec_loop:
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 0, [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP16]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP17]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT:    [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8:       mismatch_vec_loop_inc:
+; LMUL8-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
+; LMUL8-NEXT:    [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]]
+; LMUL8-NEXT:    [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0
+; LMUL8-NEXT:    br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8:       mismatch_vec_loop_found:
+; LMUL8-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT:    [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]]
+; LMUL8-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+; LMUL8-NEXT:    br label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_pre:
+; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LMUL8:       mismatch_loop:
+; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT:    [[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]]
+; LMUL8-NEXT:    [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
+; LMUL8-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]]
+; LMUL8-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; LMUL8-NEXT:    [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]]
+; LMUL8-NEXT:    br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8:       mismatch_loop_inc:
+; LMUL8-NEXT:    [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
+; LMUL8-NEXT:    br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8:       mismatch_end:
+; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
+; LMUL8-NEXT:    [[INC:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; LMUL8:       byte.compare:
+; LMUL8-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; LMUL8-NEXT:    br i1 [[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]]
+; LMUL8:       cleanup.thread:
+; LMUL8-NEXT:    ret void
+; LMUL8:       if.end:
+; LMUL8-NEXT:    [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT:    ret void
+;
+; LOOP-DEL-LABEL: define void @compare_bytes_cleanup_block(
+; LOOP-DEL-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL:       mismatch_loop:
+; LOOP-DEL-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[MISMATCH_LOOP]] ]
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP0]]
+; LOOP-DEL-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1
+; LOOP-DEL-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP0]]
+; LOOP-DEL-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
+; LOOP-DEL-NEXT:    [[TMP5:%.*]] = icmp ne i8 [[TMP2]], [[TMP4]]
+; LOOP-DEL-NEXT:    [[TMP6]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
+; LOOP-DEL-NEXT:    [[OR_COND:%.*]] = or i1 [[TMP5]], [[TMP7]]
+; LOOP-DEL-NEXT:    br i1 [[OR_COND]], label [[COMMON_RET:%.*]], label [[MISMATCH_LOOP]]
+; LOOP-DEL:       common.ret:
+; LOOP-DEL-NEXT:    ret void
+;
+; NO-TRANSFORM-LABEL: define void @compare_bytes_cleanup_block(
+; NO-TRANSFORM-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; NO-TRANSFORM-NEXT:  entry:
+; NO-TRANSFORM-NEXT:    br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM:       while.cond:
+; NO-TRANSFORM-NEXT:    [[LEN:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-TRANSFORM-NEXT:    [[INC]] = add i32 [[LEN]], 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], 0
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM:       while.body:
+; NO-TRANSFORM-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; NO-TRANSFORM:       cleanup.thread:
+; NO-TRANSFORM-NEXT:    ret void
+; NO-TRANSFORM:       if.end:
+; NO-TRANSFORM-NEXT:    [[RES:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT:    ret void
+entry:
+  br label %while.cond
+
+while.cond:
+  %len = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %inc = add i32 %len, 1
+  %cmp.not = icmp eq i32 %inc, 0
+  br i1 %cmp.not, label %cleanup.thread, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr i8, ptr %src1, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr i8, ptr %src2, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2, align 1
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %if.end
+
+cleanup.thread:
+  ret void
+
+if.end:
+  %res = phi i32 [ %inc, %while.body ]
+  ret void
+}
+
+;
+; NEGATIVE TESTS
+;
+
+; Similar to @compare_bytes_simple, except in the while.end block we have an extra PHI
+; with unique values for each incoming block from the loop.
+define i32 @compare_bytes_simple2(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple2(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; CHECK-NEXT:    store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple2(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    br label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT:    [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; LMUL8-NEXT:    store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple2(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    br label [[WHILE_COND:%.*]]
+; LOOP-DEL:       while.cond:
+; LOOP-DEL-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL:       while.body:
+; LOOP-DEL-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL:       while.end:
+; LOOP-DEL-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT:    [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT:    store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; LOOP-DEL-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %while.cond
+
+while.cond:
+  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+  %inc = add i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+  %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ]
+  store i32 %inc.lcssa, ptr %final_ptr
+  ret i32 %inc.lcssa
+}
+
+define i32 @compare_bytes_simple3(ptr %a, ptr %b, ptr %c, i32 %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple3(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT:    store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; CHECK-NEXT:    ret i32 [[FINAL_VAL]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple3(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    br label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT:    store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; LMUL8-NEXT:    ret i32 [[FINAL_VAL]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple3(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    br label [[WHILE_COND:%.*]]
+; LOOP-DEL:       while.cond:
+; LOOP-DEL-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL:       while.body:
+; LOOP-DEL-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL:       while.end:
+; LOOP-DEL-NEXT:    [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT:    store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; LOOP-DEL-NEXT:    ret i32 [[FINAL_VAL]]
+;
+  entry:
+  br label %while.cond
+
+  while.cond:
+  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+  %inc = add i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+  while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.end
+
+  while.end:
+  %final_val = phi i32 [ %d, %while.body ], [ %inc, %while.cond ]
+  store i32 %final_val, ptr %c
+  ret i32 %final_val
+}
+
+; Disable the optimization when noimplicitfloat is present.
+define i32 @no_implicit_float(ptr %a, ptr %b, i32 %len, i32 %n) noimplicitfloat {
+; CHECK-LABEL: define i32 @no_implicit_float(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @no_implicit_float(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; LMUL8-NEXT:  entry:
+; LMUL8-NEXT:    br label [[WHILE_COND:%.*]]
+; LMUL8:       while.cond:
+; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8:       while.body:
+; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8:       while.end:
+; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @no_implicit_float(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    br label [[WHILE_COND:%.*]]
+; LOOP-DEL:       while.cond:
+; LOOP-DEL-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL:       while.body:
+; LOOP-DEL-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL:       while.end:
+; LOOP-DEL-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %while.cond
+
+while.cond:
+  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+  %inc = add i32 %len.addr, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+  %idxprom = zext i32 %inc to i64
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+  %0 = load i8, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx2
+  %cmp.not2 = icmp eq i8 %0, %1
+  br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+  ret i32 %inc.lcssa
+}

>From f507db45827f47898ab8014321393bd46d8ec6ca Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 13 Jun 2024 15:14:06 -0700
Subject: [PATCH 3/3] fixup! [RISCV][LoopIdiomVectorize] Support VP intrinsics
 in LoopIdiomVectorize

---
 .../Transforms/Vectorize/LoopIdiomVectorize.cpp   | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 332b8498a7581..16ba9ccadaafb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -382,11 +382,6 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(
   Value *PtrA = GEPA->getPointerOperand();
   Value *PtrB = GEPB->getPointerOperand();
 
-  // At this point we know two things must be true:
-  //  1. Start <= End
-  //  2. ExtMaxLen <= MinPageSize due to the page checks.
-  // Therefore, we know that we can use a 64-bit induction variable that
-  // starts from 0 -> ExtMaxLen and it will not overflow.
   ScalableVectorType *PredVTy =
       ScalableVectorType::get(Builder.getInt1Ty(), ByteCompareVF);
 
@@ -494,11 +489,6 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch(
   Value *PtrA = GEPA->getPointerOperand();
   Value *PtrB = GEPB->getPointerOperand();
 
-  // At this point we know two things must be true:
-  //  1. Start <= End
-  //  2. ExtMaxLen <= 4096 due to the page checks.
-  // Therefore, we know that we can use a 64-bit induction variable that
-  // starts from 0 -> ExtMaxLen and it will not overflow.
   auto *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
   Builder.Insert(JumpToVectorLoop);
 
@@ -758,6 +748,11 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   // processed in each iteration, etc.
   Builder.SetInsertPoint(VectorLoopPreheaderBlock);
 
+  // At this point we know two things must be true:
+  //  1. Start <= End
+  //  2. ExtMaxLen <= MinPageSize due to the page checks.
+  // Therefore, we know that we can use a 64-bit induction variable that
+  // starts from 0 -> ExtMaxLen and it will not overflow.
   Value *VectorLoopRes = nullptr;
   switch (VectorizeStyle) {
   case LoopIdiomVectorizeStyle::Masked:



More information about the llvm-commits mailing list