[llvm] [RISCV][LoopIdiomVectorize] Support VP intrinsics in LoopIdiomVectorize (PR #94082)
Min-Yih Hsu via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 2 18:44:26 PDT 2024
https://github.com/mshockwave updated https://github.com/llvm/llvm-project/pull/94082
>From 1e1581355d637b239cc70448a68c8d92c1ec105f Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 30 May 2024 11:10:07 -0700
Subject: [PATCH 1/6] [RISCV][LoopIdiomVectorize] Support VP intrinsics in
LoopIdiomVectorize
Teach LoopIdiomVectorize to use VP intrinsics to replace the byte
compare loops. Right now only RISC-V uses LoopIdiomVectorize of this
style.
---
.../Transforms/Vectorize/LoopIdiomVectorize.h | 17 +-
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 10 +
llvm/lib/Target/RISCV/RISCVTargetMachine.h | 2 +
.../Target/RISCV/RISCVTargetTransformInfo.h | 2 +
.../Vectorize/LoopIdiomVectorize.cpp | 180 +-
.../LoopIdiom/RISCV/byte-compare-index.ll | 1751 +++++++++++++++++
6 files changed, 1948 insertions(+), 14 deletions(-)
create mode 100644 llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h
index 56f44b7dc6b2a..ef6e0e0687809 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h
@@ -13,7 +13,22 @@
#include "llvm/Transforms/Scalar/LoopPassManager.h"
namespace llvm {
-struct LoopIdiomVectorizePass : PassInfoMixin<LoopIdiomVectorizePass> {
+/// Selects how LoopIdiomVectorize emits its vector loop: Masked uses masked
+/// vector intrinsics, Predicated uses VP intrinsics.
+enum class LoopIdiomVectorizeStyle { Masked, Predicated };
+
+/// Loop pass wrapper that carries the vectorization style and the byte-compare
+/// VF to use. Both values can be overridden on the command line through
+/// -loop-idiom-vectorize-style and -loop-idiom-vectorize-bytecmp-vf.
+class LoopIdiomVectorizePass : public PassInfoMixin<LoopIdiomVectorizePass> {
+ // Style used unless -loop-idiom-vectorize-style is given explicitly.
+ LoopIdiomVectorizeStyle VectorizeStyle = LoopIdiomVectorizeStyle::Masked;
+
+ // The VF used in vectorizing the byte compare pattern, i.e. the minimum
+ // element count of the scalable vector types that get created.
+ unsigned ByteCompareVF = 16;
+
+public:
+ LoopIdiomVectorizePass() = default;
+ explicit LoopIdiomVectorizePass(LoopIdiomVectorizeStyle S)
+ : VectorizeStyle(S) {}
+
+ LoopIdiomVectorizePass(LoopIdiomVectorizeStyle S, unsigned BCVF)
+ : VectorizeStyle(S), ByteCompareVF(BCVF) {}
+
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR, LPMUpdater &U);
};
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index f76aef742290c..50f4920d74799 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -33,10 +33,12 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
#include <optional>
using namespace llvm;
@@ -572,6 +574,14 @@ void RISCVPassConfig::addPostRegAlloc() {
addPass(createRISCVRedundantCopyEliminationPass());
}
+// Adds LoopIdiomVectorizePass, configured for the Predicated (VP intrinsics)
+// style, at PassBuilder's late loop optimizations extension point.
+// PopulateClassToPassNames and Level are not used by this callback.
+void RISCVTargetMachine::registerPassBuilderCallbacks(
+ PassBuilder &PB, bool PopulateClassToPassNames) {
+ PB.registerLateLoopOptimizationsEPCallback([=](LoopPassManager &LPM,
+ OptimizationLevel Level) {
+ LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated));
+ });
+}
+
yaml::MachineFunctionInfo *
RISCVTargetMachine::createDefaultFuncInfoYAML() const {
return new yaml::RISCVMachineFunctionInfo();
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
index 68dfb3c81f2fe..7111d5ec80e47 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
@@ -59,6 +59,8 @@ class RISCVTargetMachine : public LLVMTargetMachine {
PerFunctionMIParsingState &PFS,
SMDiagnostic &Error,
SMRange &SourceRange) const override;
+ void registerPassBuilderCallbacks(PassBuilder &PB,
+ bool PopulateClassToPassNames) override;
};
} // namespace llvm
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index c4d10aada1f4c..9c37a4f6ec2d0 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -397,6 +397,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
bool shouldFoldTerminatingConditionAfterLSR() const {
return true;
}
+
+ // Minimum page size reported by the RISC-V TTI: always 4096 bytes.
+ // NOTE(review): presumably consumed by LoopIdiomVectorize's page-crossing
+ // check (the test shifts addresses right by 12 bits) -- confirm.
+ std::optional<unsigned> getMinPageSize() const { return 4096; }
};
} // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index c7a8700e14531..58595007f55e6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -59,19 +59,34 @@ static cl::opt<bool> DisableAll("disable-loop-idiom-vectorize-all", cl::Hidden,
cl::init(false),
cl::desc("Disable Loop Idiom Vectorize Pass."));
+// Command-line override for the vectorization style. It replaces the style
+// given to the LoopIdiomVectorizePass constructor only when the option occurs
+// explicitly on the command line (LoopIdiomVectorizePass::run checks
+// getNumOccurrences()).
+static cl::opt<LoopIdiomVectorizeStyle>
+ LITVecStyle("loop-idiom-vectorize-style", cl::Hidden,
+ cl::desc("The vectorization style for loop idiom transform."),
+ cl::values(clEnumValN(LoopIdiomVectorizeStyle::Masked, "masked",
+ "Use masked vector intrinsics"),
+ clEnumValN(LoopIdiomVectorizeStyle::Predicated,
+ "predicated", "Use VP intrinsics")),
+ cl::init(LoopIdiomVectorizeStyle::Masked));
+
static cl::opt<bool>
DisableByteCmp("disable-loop-idiom-vectorize-bytecmp", cl::Hidden,
cl::init(false),
cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
"not convert byte-compare loop(s)."));
+// Command-line override for the byte-compare vectorization factor. It replaces
+// the VF given to the LoopIdiomVectorizePass constructor only when the option
+// occurs explicitly on the command line (LoopIdiomVectorizePass::run checks
+// getNumOccurrences()).
+static cl::opt<unsigned>
+ ByteCmpVF("loop-idiom-vectorize-bytecmp-vf", cl::Hidden,
+ cl::desc("The vectorization factor for byte-compare patterns."),
+ cl::init(16));
+
static cl::opt<bool>
VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false),
cl::desc("Verify loops generated Loop Idiom Vectorize Pass."));
namespace {
-
class LoopIdiomVectorize {
+ LoopIdiomVectorizeStyle VectorizeStyle;
+ unsigned ByteCompareVF;
Loop *CurLoop = nullptr;
DominatorTree *DT;
LoopInfo *LI;
@@ -86,10 +101,11 @@ class LoopIdiomVectorize {
BasicBlock *VectorLoopIncBlock = nullptr;
public:
- explicit LoopIdiomVectorize(DominatorTree *DT, LoopInfo *LI,
- const TargetTransformInfo *TTI,
- const DataLayout *DL)
- : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+ // S selects between the masked and the VP-intrinsic vector loop; VF is the
+ // minimum element count of the scalable vectors used for the byte-compare
+ // pattern. The remaining arguments are stored as-is for later use.
+ LoopIdiomVectorize(LoopIdiomVectorizeStyle S, unsigned VF, DominatorTree *DT,
+ LoopInfo *LI, const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+ : VectorizeStyle(S), ByteCompareVF(VF), DT(DT), LI(LI), TTI(TTI), DL(DL) {
+ }
bool run(Loop *L);
@@ -111,6 +127,10 @@ class LoopIdiomVectorize {
GetElementPtrInst *GEPA,
GetElementPtrInst *GEPB, Value *ExtStart,
Value *ExtEnd);
+ Value *createPredicatedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
+ GetElementPtrInst *GEPA,
+ GetElementPtrInst *GEPB, Value *ExtStart,
+ Value *ExtEnd);
void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
PHINode *IndPhi, Value *MaxLen, Instruction *Index,
@@ -128,8 +148,16 @@ PreservedAnalyses LoopIdiomVectorizePass::run(Loop &L, LoopAnalysisManager &AM,
const auto *DL = &L.getHeader()->getDataLayout();
- LoopIdiomVectorize LIT(&AR.DT, &AR.LI, &AR.TTI, DL);
- if (!LIT.run(&L))
+ LoopIdiomVectorizeStyle VecStyle = VectorizeStyle;
+ if (LITVecStyle.getNumOccurrences())
+ VecStyle = LITVecStyle;
+
+ unsigned BCVF = ByteCompareVF;
+ if (ByteCmpVF.getNumOccurrences())
+ BCVF = ByteCmpVF;
+
+ LoopIdiomVectorize LIV(VecStyle, BCVF, &AR.DT, &AR.LI, &AR.TTI, DL);
+ if (!LIV.run(&L))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
@@ -360,14 +388,15 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(
// Therefore, we know that we can use a 64-bit induction variable that
// starts from 0 -> ExtMaxLen and it will not overflow.
ScalableVectorType *PredVTy =
- ScalableVectorType::get(Builder.getInt1Ty(), 16);
+ ScalableVectorType::get(Builder.getInt1Ty(), ByteCompareVF);
Value *InitialPred = Builder.CreateIntrinsic(
Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
- VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "",
- /*HasNUW=*/true, /*HasNSW=*/true);
+ VecLen =
+ Builder.CreateMul(VecLen, ConstantInt::get(I64Type, ByteCompareVF), "",
+ /*HasNUW=*/true, /*HasNSW=*/true);
Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
Builder.getInt1(false));
@@ -385,7 +414,8 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(
LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
- Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16);
+ Type *VectorLoadType =
+ ScalableVectorType::get(Builder.getInt8Ty(), ByteCompareVF);
Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
Value *VectorLhsGep =
@@ -454,6 +484,121 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(
return Builder.CreateTrunc(VectorLoopRes64, ResType);
}
+/// Emit the VP-intrinsic ("predicated") vector loop for the byte-compare idiom.
+///
+/// Starting from VectorLoopPreheaderBlock (the builder's insertion point on
+/// entry), this fills in VectorLoopStartBlock, VectorLoopIncBlock and
+/// VectorLoopMismatchBlock. Each iteration loads up to VL bytes from the base
+/// pointers of \p GEPA and \p GEPB at the current index, compares them with
+/// vp.icmp "ne" and uses vp.cttz.elts to locate the first differing byte.
+///
+/// \param Builder  Builder positioned in VectorLoopPreheaderBlock.
+/// \param DTU      Receives the dominator-tree edges for the new branches.
+/// \param GEPA     GEP of the first byte stream; supplies the base pointer and
+///                 the inbounds flag.
+/// \param GEPB     Same, for the second byte stream.
+/// \param ExtStart 64-bit index at which the comparison starts.
+/// \param ExtEnd   64-bit index at which the comparison stops (exclusive).
+/// \returns the i32 index of the first mismatch. The value is defined in
+///          VectorLoopMismatchBlock, which is also where the builder is left.
+///          When no mismatch exists the loop instead leaves through
+///          VectorLoopIncBlock to EndBlock.
+Value *LoopIdiomVectorize::createPredicatedFindMismatch(
+    IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
+    GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) {
+  Type *I64Type = Builder.getInt64Ty();
+  Type *I32Type = Builder.getInt32Ty();
+  Type *ResType = I32Type;
+  Type *LoadType = Builder.getInt8Ty();
+  Value *PtrA = GEPA->getPointerOperand();
+  Value *PtrB = GEPB->getPointerOperand();
+
+  // At this point we know two things must be true:
+  //  1. Start <= End
+  //  2. ExtMaxLen <= 4096 due to the page checks.
+  // Therefore, we know that we can use a 64-bit induction variable that
+  // runs from ExtStart up to ExtEnd and it will not overflow.
+  auto *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
+  Builder.Insert(JumpToVectorLoop);
+
+  DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
+                     VectorLoopStartBlock}});
+
+  // Set up the first vector loop block by creating the index PHI, doing the
+  // VP loads and comparing the loaded vectors.
+  Builder.SetInsertPoint(VectorLoopStartBlock);
+  auto *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vector_index");
+  VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
+
+  // Calculate the AVL (number of bytes still to compare) by subtracting the
+  // current vector loop index from the end index.
+  Value *AVL = Builder.CreateSub(ExtEnd, VectorIndexPhi, "avl", /*HasNUW=*/true,
+                                 /*HasNSW=*/true);
+
+  auto *VectorLoadType = ScalableVectorType::get(LoadType, ByteCompareVF);
+  auto *VF = ConstantInt::get(
+      I32Type, VectorLoadType->getElementCount().getKnownMinValue());
+  auto *IsScalable = ConstantInt::getBool(
+      Builder.getContext(), VectorLoadType->getElementCount().isScalable());
+
+  // VL is the explicit vector length shared by every VP operation below.
+  Value *VL = Builder.CreateIntrinsic(Intrinsic::experimental_get_vector_length,
+                                      {I64Type}, {AVL, VF, IsScalable});
+  Value *GepOffset = VectorIndexPhi;
+
+  Value *VectorLhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset);
+  if (GEPA->isInBounds())
+    cast<GetElementPtrInst>(VectorLhsGep)->setIsInBounds(true);
+  VectorType *TrueMaskTy =
+      VectorType::get(Builder.getInt1Ty(), VectorLoadType->getElementCount());
+  Value *AllTrueMask = Constant::getAllOnesValue(TrueMaskTy);
+  Value *VectorLhsLoad = Builder.CreateIntrinsic(
+      Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
+      {VectorLhsGep, AllTrueMask, VL}, nullptr, "lhs.load");
+
+  Value *VectorRhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset);
+  if (GEPB->isInBounds())
+    cast<GetElementPtrInst>(VectorRhsGep)->setIsInBounds(true);
+  // Take the pointer overload type from the RHS pointer itself: nothing in
+  // this function guarantees that PtrA and PtrB share a pointer type.
+  Value *VectorRhsLoad = Builder.CreateIntrinsic(
+      Intrinsic::vp_load, {VectorLoadType, VectorRhsGep->getType()},
+      {VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load");
+
+  StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE);
+  auto *PredicateMDS = MDString::get(VectorLhsLoad->getContext(), PredicateStr);
+  Value *Pred = MetadataAsValue::get(VectorLhsLoad->getContext(), PredicateMDS);
+  Value *VectorMatchCmp = Builder.CreateIntrinsic(
+      Intrinsic::vp_icmp, {VectorLhsLoad->getType()},
+      {VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr,
+      "mismatch.cmp");
+  Value *CTZ = Builder.CreateIntrinsic(
+      Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()},
+      {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true), AllTrueMask,
+       VL});
+  // RISC-V refines/lowers the poison returned by vp.cttz.elts to -1.
+  // FIXME(review): with ZeroIsPoison=true the no-mismatch case is poison at
+  // the IR level, so this compare and the branch on it depend on a
+  // target-specific lowering. Consider passing ZeroIsPoison=false and
+  // comparing CTZ against VL instead; doing so changes the emitted IR and
+  // therefore the test expectations.
+  Value *MismatchFound =
+      Builder.CreateICmpSGE(CTZ, ConstantInt::get(ResType, 0));
+  auto *VectorEarlyExit = BranchInst::Create(VectorLoopMismatchBlock,
+                                             VectorLoopIncBlock, MismatchFound);
+  Builder.Insert(VectorEarlyExit);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
+       {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
+
+  // Advance the index by the number of elements just compared (VL). We branch
+  // back to the start of the loop unless the index has reached ExtEnd, in
+  // which case no mismatch was found and we leave through EndBlock.
+  Builder.SetInsertPoint(VectorLoopIncBlock);
+  Value *VL64 = Builder.CreateZExt(VL, I64Type);
+  Value *NewVectorIndexPhi =
+      Builder.CreateAdd(VectorIndexPhi, VL64, "",
+                        /*HasNUW=*/true, /*HasNSW=*/true);
+  VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
+  Value *ExitCond = Builder.CreateICmpNE(NewVectorIndexPhi, ExtEnd);
+  auto *VectorLoopBranchBack =
+      BranchInst::Create(VectorLoopStartBlock, EndBlock, ExitCond);
+  Builder.Insert(VectorLoopBranchBack);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
+       {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
+
+  // If we found a mismatch then we need to calculate which lane in the vector
+  // had a mismatch and add that on to the current loop index.
+  Builder.SetInsertPoint(VectorLoopMismatchBlock);
+
+  // Add LCSSA phis for CTZ and VectorIndexPhi.
+  auto *CTZLCSSAPhi = Builder.CreatePHI(CTZ->getType(), 1, "ctz");
+  CTZLCSSAPhi->addIncoming(CTZ, VectorLoopStartBlock);
+  auto *VectorIndexLCSSAPhi =
+      Builder.CreatePHI(VectorIndexPhi->getType(), 1, "mismatch_vector_index");
+  VectorIndexLCSSAPhi->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
+
+  Value *CTZI64 = Builder.CreateZExt(CTZLCSSAPhi, I64Type);
+  Value *VectorLoopRes64 = Builder.CreateAdd(VectorIndexLCSSAPhi, CTZI64, "",
+                                             /*HasNUW=*/true, /*HasNSW=*/true);
+  return Builder.CreateTrunc(VectorLoopRes64, ResType);
+}
+
Value *LoopIdiomVectorize::expandFindMismatch(
IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) {
@@ -613,8 +758,17 @@ Value *LoopIdiomVectorize::expandFindMismatch(
// processed in each iteration, etc.
Builder.SetInsertPoint(VectorLoopPreheaderBlock);
- Value *VectorLoopRes =
- createMaskedFindMismatch(Builder, DTU, GEPA, GEPB, ExtStart, ExtEnd);
+ Value *VectorLoopRes = nullptr;
+ switch (VectorizeStyle) {
+ case LoopIdiomVectorizeStyle::Masked:
+ VectorLoopRes =
+ createMaskedFindMismatch(Builder, DTU, GEPA, GEPB, ExtStart, ExtEnd);
+ break;
+ case LoopIdiomVectorizeStyle::Predicated:
+ VectorLoopRes = createPredicatedFindMismatch(Builder, DTU, GEPA, GEPB,
+ ExtStart, ExtEnd);
+ break;
+ }
Builder.Insert(BranchInst::Create(EndBlock));
diff --git a/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
new file mode 100644
index 0000000000000..845daa402606f
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
@@ -0,0 +1,1751 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=loop-idiom-vectorize -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -mattr=+v -S < %s | FileCheck %s
+; RUN: opt -passes=loop-idiom-vectorize -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -loop-idiom-vectorize-bytecmp-vf=64 -mattr=+v -S < %s | FileCheck %s --check-prefix=LMUL8
+; RUN: opt -passes='loop(loop-idiom-vectorize),simplifycfg' -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -mattr=+v -S < %s | FileCheck %s --check-prefix=LOOP-DEL
+
+define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: mismatch_vec_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: byte.compare:
+; CHECK-NEXT: br label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; LMUL8: mismatch_vec_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vec_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vec_loop_inc:
+; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vec_loop_found:
+; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: br label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1:![0-9]+]]
+; LOOP-DEL: mismatch_vec_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL: mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL: mismatch_vec_loop_found:
+; LOOP-DEL-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT: br label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]]
+;
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+ ret i32 %inc.lcssa
+}
+
+; Byte-compare loop whose index increment carries `nsw` (see the `add nsw` in
+; while.cond at the bottom of this function). Apart from that flag the input
+; IR is the same as @compare_bytes_simple.
+;
+; What the assertion groups below expect (the RUN lines that select each
+; prefix are not visible in this part of the file):
+;  * default prefix: the loop is preceded by a mismatch search made of a
+;    start <= n check, a memory check that compares `lshr ..., 12` of the
+;    first and last addresses of both buffers, a vector loop built from
+;    get.vector.length / vp.load / vp.icmp "ne" / vp.cttz.elts on
+;    <vscale x 16 x i8>, and a scalar fallback loop. The fallback loop's
+;    increment keeps `nsw`; the start index computed in entry does not.
+;    The original while.cond/while.body blocks are still present but are
+;    only reachable through the false edge of a `br i1 true`.
+;  * LMUL8 prefix: same shape, with 64 in place of 16 for the vector length
+;    argument and the scalable vector element count.
+;  * LOOP-DEL prefix: same mismatch search, but the original loop blocks are
+;    absent and while.end returns the mismatch result directly.
+;  * NO-TRANSFORM prefix: the loop is left exactly as written.
+define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_signed_wrap(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vec_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: byte.compare:
+; CHECK-NEXT: br label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_signed_wrap(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8: mismatch_vec_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vec_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vec_loop_inc:
+; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vec_loop_found:
+; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: br label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_signed_wrap(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_vec_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL: mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL: mismatch_vec_loop_found:
+; LOOP-DEL-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT: br label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_signed_wrap(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add nsw i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; NO-TRANSFORM: while.end:
+; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]]
+;
+; Input IR: a two-block loop that walks %a and %b in lock step starting at
+; index %len + 1 and returns the index at which it stopped.
+entry:
+ br label %while.cond
+
+while.cond:
+ ; The index is bumped before it is compared with %n or used as an address,
+ ; so the first byte examined is at %len + 1.
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ ; `nsw` on this add is the only difference from @compare_bytes_simple.
+ %inc = add nsw i32 %len.addr, 1
+ ; Leave the loop once the incremented index equals %n.
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ ; The i32 index is zero-extended before indexing both buffers.
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ ; Keep iterating while the bytes match; the first mismatch exits the loop.
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ ; Both exits forward %inc: it equals %n when the loop ran to the end,
+ ; otherwise it is the index of the first mismatching byte.
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+ ret i32 %inc.lcssa
+}
+
+
+define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vec_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; CHECK: while.found:
+; CHECK-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: br label [[END:%.*]]
+; CHECK: byte.compare:
+; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]]
+; CHECK: while.end:
+; CHECK-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; CHECK-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; CHECK-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; CHECK-NEXT: ret i32 [[MISMATCH_INDEX]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8: mismatch_vec_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vec_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vec_loop_inc:
+; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vec_loop_found:
+; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; LMUL8: while.found:
+; LMUL8-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: br label [[END:%.*]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: br label [[END]]
+; LMUL8: end:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; LMUL8-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; LMUL8-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; LMUL8-NEXT: ret i32 [[MISMATCH_INDEX]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_vec_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL: mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[BYTE_COMPARE:%.*]]
+; LOOP-DEL: mismatch_vec_loop_found:
+; LOOP-DEL-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT: br label [[BYTE_COMPARE]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[BYTE_COMPARE]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[BYTE_COMPARE]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: byte.compare:
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: [[TMP37:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LOOP-DEL-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP37]], i32 [[N]], i32 [[MISMATCH_RESULT]]
+; LOOP-DEL-NEXT: [[SPEC_SELECT4:%.*]] = select i1 [[TMP37]], ptr [[D]], ptr [[C]]
+; LOOP-DEL-NEXT: store i32 [[SPEC_SELECT]], ptr [[SPEC_SELECT4]], align 4
+; LOOP-DEL-NEXT: ret i32 [[SPEC_SELECT]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; NO-TRANSFORM: while.found:
+; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT: br label [[END:%.*]]
+; NO-TRANSFORM: while.end:
+; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT: br label [[END]]
+; NO-TRANSFORM: end:
+; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; NO-TRANSFORM-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; NO-TRANSFORM-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; NO-TRANSFORM-NEXT: ret i32 [[MISMATCH_INDEX]]
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.found
+
+while.found:
+ %mismatch_index1 = phi i32 [ %inc, %while.body ]
+ %found_ptr = phi ptr [ %c, %while.body ]
+ br label %end
+
+while.end:
+ %mismatch_index2 = phi i32 [ %n, %while.cond ]
+ %end_ptr = phi ptr [ %d, %while.cond ]
+ br label %end
+
+end:
+ %mismatch_index = phi i32 [ %mismatch_index1, %while.found ], [ %mismatch_index2, %while.end ]
+ %store_ptr = phi ptr [ %end_ptr, %while.end ], [ %found_ptr, %while.found ]
+ store i32 %mismatch_index, ptr %store_ptr
+ ret i32 %mismatch_index
+}
+
+
+
+define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
+; CHECK-LABEL: define i32 @compare_bytes_extra_cmp(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; CHECK-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; CHECK: ph:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vec_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
+; CHECK: byte.compare:
+; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT]]
+; CHECK: while.end.loopexit:
+; CHECK-NEXT: [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: br label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_extra_cmp(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; LMUL8-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; LMUL8: ph:
+; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8: mismatch_vec_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vec_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vec_loop_inc:
+; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vec_loop_found:
+; LMUL8-NEXT: [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: br label [[WHILE_END_LOOPEXIT]]
+; LMUL8: while.end.loopexit:
+; LMUL8-NEXT: [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: br label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_extra_cmp(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; LOOP-DEL-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; LOOP-DEL: ph:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_vec_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL: mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_vec_loop_found:
+; LOOP-DEL-NEXT: [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT: br label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_extra_cmp(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) {
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; NO-TRANSFORM: ph:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[PH]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; NO-TRANSFORM: while.end:
+; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ], [ [[X]], [[ENTRY:%.*]] ]
+; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]]
+entry:
+ %cmp.x = icmp ult i32 %n, %x
+ br i1 %cmp.x, label %ph, label %while.end
+
+ph:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %ph ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ], [ %x, %entry ]
+ ret i32 %inc.lcssa
+}
+
+define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) { ; byte-compare loop whose two exits lead to different blocks (cleanup.thread and if.end)
+; CHECK-LABEL: define void @compare_bytes_cleanup_block(
+; CHECK-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
+; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP10]], 12
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP4]], 12
+; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP9]], 12
+; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vec_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 0, [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP16]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
+; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0
+; CHECK-NEXT: br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]]
+; CHECK-NEXT: br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
+; CHECK-NEXT: br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; CHECK: byte.compare:
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; CHECK-NEXT: br i1 [[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]]
+; CHECK: cleanup.thread:
+; CHECK-NEXT: ret void
+; CHECK: if.end:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: ret void
+;
+; LMUL8-LABEL: define void @compare_bytes_cleanup_block(
+; LMUL8-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
+; LMUL8-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
+; LMUL8-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LMUL8-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
+; LMUL8-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
+; LMUL8-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; LMUL8-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP10]], 12
+; LMUL8-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP4]], 12
+; LMUL8-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP9]], 12
+; LMUL8-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]]
+; LMUL8-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]]
+; LMUL8-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; LMUL8-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8: mismatch_vec_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vec_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 0, [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP16]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP17]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT: [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vec_loop_inc:
+; LMUL8-NEXT: [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
+; LMUL8-NEXT: [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]]
+; LMUL8-NEXT: [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0
+; LMUL8-NEXT: br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vec_loop_found:
+; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]]
+; LMUL8-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]]
+; LMUL8-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
+; LMUL8-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]]
+; LMUL8-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; LMUL8-NEXT: [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]]
+; LMUL8-NEXT: br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT: [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
+; LMUL8-NEXT: br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; LMUL8-NEXT: br i1 [[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]]
+; LMUL8: cleanup.thread:
+; LMUL8-NEXT: ret void
+; LMUL8: if.end:
+; LMUL8-NEXT: [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: ret void
+;
+; LOOP-DEL-LABEL: define void @compare_bytes_cleanup_block(
+; LOOP-DEL-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[MISMATCH_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP0]]
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP0]]
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP2]], [[TMP4]]
+; LOOP-DEL-NEXT: [[TMP6]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
+; LOOP-DEL-NEXT: [[OR_COND:%.*]] = or i1 [[TMP5]], [[TMP7]]
+; LOOP-DEL-NEXT: br i1 [[OR_COND]], label [[COMMON_RET:%.*]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: common.ret:
+; LOOP-DEL-NEXT: ret void
+;
+; NO-TRANSFORM-LABEL: define void @compare_bytes_cleanup_block(
+; NO-TRANSFORM-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], 0
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; NO-TRANSFORM: cleanup.thread:
+; NO-TRANSFORM-NEXT: ret void
+; NO-TRANSFORM: if.end:
+; NO-TRANSFORM-NEXT: [[RES:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT: ret void
+entry:
+ br label %while.cond
+
+while.cond:
+ %len = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; index starts at 0; %inc is the first index actually compared
+ %inc = add i32 %len, 1
+ %cmp.not = icmp eq i32 %inc, 0 ; the bound is the constant 0, so this only holds once %inc has wrapped around
+ br i1 %cmp.not, label %cleanup.thread, label %while.body ; exit 1: bound reached, leaves via cleanup.thread
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr i8, ptr %src1, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %arrayidx2 = getelementptr i8, ptr %src2, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2, align 1
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %if.end ; exit 2: bytes differ, leaves via if.end
+
+cleanup.thread:
+ ret void
+
+if.end:
+ %res = phi i32 [ %inc, %while.body ] ; mismatch index as seen on the if.end exit; it has no users in this function
+ ret void
+}
+
+;
+; NEGATIVE TESTS
+;
+
+; Similar to @compare_bytes_simple, except that the while.end block has an extra PHI (%final_ptr)
+; whose incoming value differs for each exiting block of the loop; the checks below expect the loop to stay as is.
+define i32 @compare_bytes_simple2(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple2(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; CHECK-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple2(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: br label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; LMUL8-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple2(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]]
+; LOOP-DEL: while.cond:
+; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL: while.body:
+; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]]
+;
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body ; exit 1: %inc reached the bound %n
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end ; exit 2: the bytes at index %inc differ
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] ; same value from both exits
+ %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ] ; %c on the mismatch exit, %d on the bound-reached exit
+ store i32 %inc.lcssa, ptr %final_ptr
+ ret i32 %inc.lcssa
+}
+
+; Negative test: the loop below loads one byte from %a and one from %b at the
+; same index and exits either when the index reaches %n or when the bytes
+; differ. Unlike a plain mismatch-index loop, the exit phi %final_val takes %d
+; (not the index %inc) on the mismatch edge from while.body. All three check
+; prefixes (CHECK, LMUL8, LOOP-DEL) expect the scalar loop to be left as
+; written, with no vector blocks inserted.
+; NOTE(review): presumably the idiom is rejected because the value leaving the
+; mismatch exit is not the induction value - confirm against the matcher in
+; LoopIdiomVectorize.cpp.
+define i32 @compare_bytes_simple3(ptr %a, ptr %b, ptr %c, i32 %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple3(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; CHECK-NEXT: ret i32 [[FINAL_VAL]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple3(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: br label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; LMUL8-NEXT: ret i32 [[FINAL_VAL]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple3(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]]
+; LOOP-DEL: while.cond:
+; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL: while.body:
+; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; LOOP-DEL-NEXT: ret i32 [[FINAL_VAL]]
+;
+ entry:
+ br label %while.cond
+
+ while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+ while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+ while.end:
+ ; %d when leaving via the byte mismatch, %inc when the index reached %n.
+ %final_val = phi i32 [ %d, %while.body ], [ %inc, %while.cond ]
+ store i32 %final_val, ptr %c
+ ret i32 %final_val
+}
+
+; Negative test: the optimization is disabled when noimplicitfloat is present.
+; The function carries the noimplicitfloat attribute, and all three check
+; prefixes (CHECK, LMUL8, LOOP-DEL) expect the scalar byte-compare loop to be
+; left as written, with no vector blocks inserted.
+define i32 @no_implicit_float(ptr %a, ptr %b, i32 %len, i32 %n) noimplicitfloat {
+; CHECK-LABEL: define i32 @no_implicit_float(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @no_implicit_float(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: br label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @no_implicit_float(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]]
+; LOOP-DEL: while.cond:
+; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL: while.body:
+; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]]
+;
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+ ret i32 %inc.lcssa
+}
>From c3700ca7deb112114f87d15db00b367e225d7382 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 13 Jun 2024 15:14:06 -0700
Subject: [PATCH 2/6] fixup! [RISCV][LoopIdiomVectorize] Support VP intrinsics
in LoopIdiomVectorize
---
.../Transforms/Vectorize/LoopIdiomVectorize.cpp | 15 +++++----------
1 file changed, 5 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 58595007f55e6..c7459620c30ec 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -382,11 +382,6 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(
Value *PtrA = GEPA->getPointerOperand();
Value *PtrB = GEPB->getPointerOperand();
- // At this point we know two things must be true:
- // 1. Start <= End
- // 2. ExtMaxLen <= MinPageSize due to the page checks.
- // Therefore, we know that we can use a 64-bit induction variable that
- // starts from 0 -> ExtMaxLen and it will not overflow.
ScalableVectorType *PredVTy =
ScalableVectorType::get(Builder.getInt1Ty(), ByteCompareVF);
@@ -494,11 +489,6 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch(
Value *PtrA = GEPA->getPointerOperand();
Value *PtrB = GEPB->getPointerOperand();
- // At this point we know two things must be true:
- // 1. Start <= End
- // 2. ExtMaxLen <= 4096 due to the page checks.
- // Therefore, we know that we can use a 64-bit induction variable that
- // starts from 0 -> ExtMaxLen and it will not overflow.
auto *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
Builder.Insert(JumpToVectorLoop);
@@ -758,6 +748,11 @@ Value *LoopIdiomVectorize::expandFindMismatch(
// processed in each iteration, etc.
Builder.SetInsertPoint(VectorLoopPreheaderBlock);
+ // At this point we know two things must be true:
+ // 1. Start <= End
+ // 2. ExtMaxLen <= MinPageSize due to the page checks.
+ // Therefore, we know that we can use a 64-bit induction variable that
+ // starts from 0 -> ExtMaxLen and it will not overflow.
Value *VectorLoopRes = nullptr;
switch (VectorizeStyle) {
case LoopIdiomVectorizeStyle::Masked:
>From 3babd9829f08f694182a0dda85e5f5d38f9c8098 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Mon, 1 Jul 2024 10:42:07 -0700
Subject: [PATCH 3/6] Address review comments
---
.../Vectorize/LoopIdiomVectorize.cpp | 22 +++-----
.../LoopIdiom/RISCV/byte-compare-index.ll | 56 +++++++++----------
2 files changed, 36 insertions(+), 42 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index c7459620c30ec..20b9edaa30181 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -506,18 +506,15 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch(
/*HasNSW=*/true);
auto *VectorLoadType = ScalableVectorType::get(LoadType, ByteCompareVF);
- auto *VF = ConstantInt::get(
- I32Type, VectorLoadType->getElementCount().getKnownMinValue());
- auto *IsScalable = ConstantInt::getBool(
- Builder.getContext(), VectorLoadType->getElementCount().isScalable());
+ auto *VF = ConstantInt::get(I32Type, ByteCompareVF);
+ auto *IsScalable = ConstantInt::getBool(Builder.getContext(), true);
Value *VL = Builder.CreateIntrinsic(Intrinsic::experimental_get_vector_length,
{I64Type}, {AVL, VF, IsScalable});
Value *GepOffset = VectorIndexPhi;
- Value *VectorLhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset);
- if (GEPA->isInBounds())
- cast<GetElementPtrInst>(VectorLhsGep)->setIsInBounds(true);
+ Value *VectorLhsGep =
+ Builder.CreateGEP(LoadType, PtrA, GepOffset, "", GEPA->isInBounds());
VectorType *TrueMaskTy =
VectorType::get(Builder.getInt1Ty(), VectorLoadType->getElementCount());
Value *AllTrueMask = Constant::getAllOnesValue(TrueMaskTy);
@@ -525,9 +522,8 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch(
Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
{VectorLhsGep, AllTrueMask, VL}, nullptr, "lhs.load");
- Value *VectorRhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset);
- if (GEPB->isInBounds())
- cast<GetElementPtrInst>(VectorRhsGep)->setIsInBounds(true);
+ Value *VectorRhsGep =
+ Builder.CreateGEP(LoadType, PtrB, GepOffset, "", GEPB->isInBounds());
Value *VectorRhsLoad = Builder.CreateIntrinsic(
Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
{VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load");
@@ -541,11 +537,9 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch(
"mismatch.cmp");
Value *CTZ = Builder.CreateIntrinsic(
Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()},
- {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true), AllTrueMask,
+ {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(false), AllTrueMask,
VL});
- // RISC-V refines/lowers the poison returned by vp.cttz.elts to -1.
- Value *MismatchFound =
- Builder.CreateICmpSGE(CTZ, ConstantInt::get(ResType, 0));
+ Value *MismatchFound = Builder.CreateICmpNE(CTZ, VL);
auto *VectorEarlyExit = BranchInst::Create(VectorLoopMismatchBlock,
VectorLoopIncBlock, MismatchFound);
Builder.Insert(VectorEarlyExit);
diff --git a/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
index 845daa402606f..2cd3c4e3bb7ce 100644
--- a/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
+++ b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
@@ -42,8 +42,8 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; CHECK: mismatch_vec_loop_inc:
; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
@@ -132,8 +132,8 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
-; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
-; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LMUL8: mismatch_vec_loop_inc:
; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
@@ -218,8 +218,8 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LOOP-DEL: mismatch_vec_loop_inc:
; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
@@ -314,8 +314,8 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; CHECK: mismatch_vec_loop_inc:
; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
@@ -404,8 +404,8 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
-; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
-; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LMUL8: mismatch_vec_loop_inc:
; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
@@ -490,8 +490,8 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LOOP-DEL: mismatch_vec_loop_inc:
; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
@@ -607,8 +607,8 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; CHECK: mismatch_vec_loop_inc:
; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
@@ -708,8 +708,8 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3
; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
-; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
-; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LMUL8: mismatch_vec_loop_inc:
; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
@@ -805,8 +805,8 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3
; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LOOP-DEL: mismatch_vec_loop_inc:
; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
@@ -952,8 +952,8 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; CHECK: mismatch_vec_loop_inc:
; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
@@ -1048,8 +1048,8 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
-; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
-; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LMUL8: mismatch_vec_loop_inc:
; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
@@ -1140,8 +1140,8 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
-; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LOOP-DEL: mismatch_vec_loop_inc:
; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
@@ -1259,8 +1259,8 @@ define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) {
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
-; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
-; CHECK-NEXT: [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i32 [[FIRST]], [[TMP15]]
; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; CHECK: mismatch_vec_loop_inc:
; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
@@ -1348,8 +1348,8 @@ define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) {
; LMUL8-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP17]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
-; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
-; LMUL8-NEXT: [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT: [[TMP18:%.*]] = icmp ne i32 [[FIRST]], [[TMP15]]
; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LMUL8: mismatch_vec_loop_inc:
; LMUL8-NEXT: [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
>From 49d491d2a030a9e960d2d8c1373ad80a0f12b741 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Mon, 1 Jul 2024 14:08:06 -0700
Subject: [PATCH 4/6] Add more tests and address review comments
---
.../Vectorize/LoopIdiomVectorize.cpp | 3 +-
.../RISCV/rvv/vfirst-byte-compare-index.ll | 178 ++++++
.../LoopIdiom/RISCV/byte-compare-index.ll | 558 ++++++++++++++++++
3 files changed, 737 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 20b9edaa30181..64e04cae2773f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -507,10 +507,9 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch(
auto *VectorLoadType = ScalableVectorType::get(LoadType, ByteCompareVF);
auto *VF = ConstantInt::get(I32Type, ByteCompareVF);
- auto *IsScalable = ConstantInt::getBool(Builder.getContext(), true);
Value *VL = Builder.CreateIntrinsic(Intrinsic::experimental_get_vector_length,
- {I64Type}, {AVL, VF, IsScalable});
+ {I64Type}, {AVL, VF, Builder.getTrue()});
Value *GepOffset = VectorIndexPhi;
Value *VectorLhsGep =
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll b/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll
new file mode 100644
index 0000000000000..cade25c2c05d8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -mattr=+v < %s | FileCheck %s
+
+; Test that llvm.vp.cttz.elts, as emitted by LoopIdiomVectorize for the byte-compare idiom (see llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll), is lowered to vfirst.m.
+
+define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
+; CHECK-LABEL: compare_bytes_simple:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sext.w a4, a3
+; CHECK-NEXT: addiw a5, a2, 1
+; CHECK-NEXT: bltu a4, a5, .LBB0_7
+; CHECK-NEXT: # %bb.1: # %mismatch_mem_check
+; CHECK-NEXT: slli a2, a5, 32
+; CHECK-NEXT: srli a2, a2, 32
+; CHECK-NEXT: slli a6, a3, 32
+; CHECK-NEXT: srli a6, a6, 32
+; CHECK-NEXT: add a7, a0, a2
+; CHECK-NEXT: add t0, a0, a6
+; CHECK-NEXT: srli a7, a7, 12
+; CHECK-NEXT: srli t0, t0, 12
+; CHECK-NEXT: bne a7, t0, .LBB0_7
+; CHECK-NEXT: # %bb.2: # %mismatch_mem_check
+; CHECK-NEXT: add a7, a1, a2
+; CHECK-NEXT: add t0, a1, a6
+; CHECK-NEXT: srli a7, a7, 12
+; CHECK-NEXT: srli t0, t0, 12
+; CHECK-NEXT: bne a7, t0, .LBB0_7
+; CHECK-NEXT: .LBB0_3: # %mismatch_vec_loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub a4, a6, a2
+; CHECK-NEXT: vsetvli a4, a4, e8, m2, ta, ma
+; CHECK-NEXT: add a5, a0, a2
+; CHECK-NEXT: vle8.v v8, (a5)
+; CHECK-NEXT: add a5, a1, a2
+; CHECK-NEXT: vle8.v v10, (a5)
+; CHECK-NEXT: vmsne.vv v12, v8, v10
+; CHECK-NEXT: vfirst.m a7, v12
+; CHECK-NEXT: mv a5, a4
+; CHECK-NEXT: bltz a7, .LBB0_5
+; CHECK-NEXT: # %bb.4: # %mismatch_vec_loop
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: mv a5, a7
+; CHECK-NEXT: .LBB0_5: # %mismatch_vec_loop
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: sext.w a7, a5
+; CHECK-NEXT: bne a7, a4, .LBB0_11
+; CHECK-NEXT: # %bb.6: # %mismatch_vec_loop_inc
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: bne a2, a6, .LBB0_3
+; CHECK-NEXT: j .LBB0_9
+; CHECK-NEXT: .LBB0_7: # %mismatch_loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: slli a2, a5, 32
+; CHECK-NEXT: srli a2, a2, 32
+; CHECK-NEXT: add a6, a0, a2
+; CHECK-NEXT: lbu a6, 0(a6)
+; CHECK-NEXT: add a2, a1, a2
+; CHECK-NEXT: lbu a2, 0(a2)
+; CHECK-NEXT: bne a6, a2, .LBB0_10
+; CHECK-NEXT: # %bb.8: # %mismatch_loop_inc
+; CHECK-NEXT: # in Loop: Header=BB0_7 Depth=1
+; CHECK-NEXT: addiw a5, a5, 1
+; CHECK-NEXT: bne a4, a5, .LBB0_7
+; CHECK-NEXT: .LBB0_9: # %while.end
+; CHECK-NEXT: mv a0, a3
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_10:
+; CHECK-NEXT: mv a0, a5
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_11: # %mismatch_vec_loop_found
+; CHECK-NEXT: slli a5, a5, 32
+; CHECK-NEXT: srli a3, a5, 32
+; CHECK-NEXT: add a0, a2, a3
+; CHECK-NEXT: ret
+entry:
+ %0 = add i32 %len, 1
+ br label %mismatch_min_it_check
+
+mismatch_min_it_check: ; preds = %entry
+ %1 = zext i32 %0 to i64
+ %2 = zext i32 %n to i64
+ %3 = icmp ule i32 %0, %n
+ br i1 %3, label %mismatch_mem_check, label %mismatch_loop_pre
+
+mismatch_mem_check: ; preds = %mismatch_min_it_check
+ %4 = getelementptr i8, ptr %a, i64 %1
+ %5 = getelementptr i8, ptr %b, i64 %1
+ %6 = ptrtoint ptr %5 to i64
+ %7 = ptrtoint ptr %4 to i64
+ %8 = getelementptr i8, ptr %a, i64 %2
+ %9 = getelementptr i8, ptr %b, i64 %2
+ %10 = ptrtoint ptr %8 to i64
+ %11 = ptrtoint ptr %9 to i64
+ %12 = lshr i64 %7, 12
+ %13 = lshr i64 %10, 12
+ %14 = lshr i64 %6, 12
+ %15 = lshr i64 %11, 12
+ %16 = icmp ne i64 %12, %13
+ %17 = icmp ne i64 %14, %15
+ %18 = or i1 %16, %17
+ br i1 %18, label %mismatch_loop_pre, label %mismatch_vec_loop_preheader
+
+mismatch_vec_loop_preheader: ; preds = %mismatch_mem_check
+ br label %mismatch_vec_loop
+
+mismatch_vec_loop: ; preds = %mismatch_vec_loop_inc, %mismatch_vec_loop_preheader
+ %mismatch_vector_index = phi i64 [ %1, %mismatch_vec_loop_preheader ], [ %25, %mismatch_vec_loop_inc ]
+ %avl = sub nuw nsw i64 %2, %mismatch_vector_index
+ %19 = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 16, i1 true)
+ %20 = getelementptr inbounds i8, ptr %a, i64 %mismatch_vector_index
+ %lhs.load = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr %20, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 %19)
+ %21 = getelementptr inbounds i8, ptr %b, i64 %mismatch_vector_index
+ %rhs.load = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr %21, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 %19)
+ %mismatch.cmp = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> %lhs.load, <vscale x 16 x i8> %rhs.load, metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 %19)
+ %22 = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %mismatch.cmp, i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 %19)
+ %23 = icmp ne i32 %22, %19
+ br i1 %23, label %mismatch_vec_loop_found, label %mismatch_vec_loop_inc
+
+mismatch_vec_loop_inc: ; preds = %mismatch_vec_loop
+ %24 = zext i32 %19 to i64
+ %25 = add nuw nsw i64 %mismatch_vector_index, %24
+ %26 = icmp ne i64 %25, %2
+ br i1 %26, label %mismatch_vec_loop, label %mismatch_end
+
+mismatch_vec_loop_found: ; preds = %mismatch_vec_loop
+ %ctz = phi i32 [ %22, %mismatch_vec_loop ]
+ %mismatch_vector_index1 = phi i64 [ %mismatch_vector_index, %mismatch_vec_loop ]
+ %27 = zext i32 %ctz to i64
+ %28 = add nuw nsw i64 %mismatch_vector_index1, %27
+ %29 = trunc i64 %28 to i32
+ br label %mismatch_end
+
+mismatch_loop_pre: ; preds = %mismatch_mem_check, %mismatch_min_it_check
+ br label %mismatch_loop
+
+mismatch_loop: ; preds = %mismatch_loop_inc, %mismatch_loop_pre
+ %mismatch_index = phi i32 [ %0, %mismatch_loop_pre ], [ %36, %mismatch_loop_inc ]
+ %30 = zext i32 %mismatch_index to i64
+ %31 = getelementptr inbounds i8, ptr %a, i64 %30
+ %32 = load i8, ptr %31, align 1
+ %33 = getelementptr inbounds i8, ptr %b, i64 %30
+ %34 = load i8, ptr %33, align 1
+ %35 = icmp eq i8 %32, %34
+ br i1 %35, label %mismatch_loop_inc, label %mismatch_end
+
+mismatch_loop_inc: ; preds = %mismatch_loop
+ %36 = add i32 %mismatch_index, 1
+ %37 = icmp eq i32 %36, %n
+ br i1 %37, label %mismatch_end, label %mismatch_loop
+
+mismatch_end: ; preds = %mismatch_loop_inc, %mismatch_loop, %mismatch_vec_loop_found, %mismatch_vec_loop_inc
+ %mismatch_result = phi i32 [ %n, %mismatch_loop_inc ], [ %mismatch_index, %mismatch_loop ], [ %n, %mismatch_vec_loop_inc ], [ %29, %mismatch_vec_loop_found ]
+ br i1 true, label %byte.compare, label %while.cond
+
+while.cond: ; preds = %mismatch_end, %while.body
+ %len.addr = phi i32 [ %len, %mismatch_end ], [ %mismatch_result, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %mismatch_result, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body: ; preds = %while.cond
+ %idxprom = zext i32 %mismatch_result to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %38 = load i8, ptr %arrayidx, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %39 = load i8, ptr %arrayidx2, align 1
+ %cmp.not2 = icmp eq i8 %38, %39
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+byte.compare: ; preds = %mismatch_end
+ br label %while.end
+
+while.end: ; preds = %byte.compare, %while.body, %while.cond
+ %inc.lcssa = phi i32 [ %mismatch_result, %while.body ], [ %mismatch_result, %while.cond ], [ %mismatch_result, %byte.compare ]
+ ret i32 %inc.lcssa
+}
+
diff --git a/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
index 2cd3c4e3bb7ce..8cf761055bd38 100644
--- a/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
+++ b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
@@ -2,6 +2,7 @@
; RUN: opt -passes=loop-idiom-vectorize -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -mattr=+v -S < %s | FileCheck %s
; RUN: opt -passes=loop-idiom-vectorize -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -loop-idiom-vectorize-bytecmp-vf=64 -mattr=+v -S < %s | FileCheck %s --check-prefix=LMUL8
; RUN: opt -passes='loop(loop-idiom-vectorize),simplifycfg' -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -mattr=+v -S < %s | FileCheck %s --check-prefix=LOOP-DEL
+; RUN: opt -passes=loop-idiom-vectorize -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=masked -mattr=+v -S < %s | FileCheck %s --check-prefix=MASKED
define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
; CHECK-LABEL: define i32 @compare_bytes_simple(
@@ -252,6 +253,101 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]]
;
+; MASKED-LABEL: define i32 @compare_bytes_simple(
+; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; MASKED-NEXT: entry:
+; MASKED-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; MASKED-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; MASKED: mismatch_min_it_check:
+; MASKED-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; MASKED-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; MASKED-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; MASKED-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; MASKED: mismatch_mem_check:
+; MASKED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; MASKED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; MASKED-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; MASKED-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; MASKED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; MASKED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; MASKED-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
+; MASKED-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; MASKED-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
+; MASKED-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
+; MASKED-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
+; MASKED-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
+; MASKED-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
+; MASKED-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; MASKED-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; MASKED-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; MASKED: mismatch_vec_loop_preheader:
+; MASKED-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
+; MASKED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; MASKED-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
+; MASKED-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; MASKED: mismatch_vec_loop:
+; MASKED-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; MASKED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
+; MASKED-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; MASKED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
+; MASKED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; MASKED-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
+; MASKED-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; MASKED-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
+; MASKED-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; MASKED: mismatch_vec_loop_inc:
+; MASKED-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
+; MASKED-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
+; MASKED-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
+; MASKED-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
+; MASKED: mismatch_vec_loop_found:
+; MASKED-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
+; MASKED-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; MASKED-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
+; MASKED-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
+; MASKED-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
+; MASKED-NEXT: br label [[MISMATCH_END]]
+; MASKED: mismatch_loop_pre:
+; MASKED-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; MASKED: mismatch_loop:
+; MASKED-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; MASKED-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; MASKED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
+; MASKED-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; MASKED-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
+; MASKED-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
+; MASKED-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
+; MASKED-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; MASKED: mismatch_loop_inc:
+; MASKED-NEXT: [[TMP43]] = add i32 [[MISMATCH_INDEX]], 1
+; MASKED-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
+; MASKED-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; MASKED: mismatch_end:
+; MASKED-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
+; MASKED-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; MASKED: while.cond:
+; MASKED-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; MASKED-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; MASKED-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; MASKED-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; MASKED: while.body:
+; MASKED-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; MASKED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MASKED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; MASKED-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]]
+; MASKED-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; MASKED: byte.compare:
+; MASKED-NEXT: br label [[WHILE_END]]
+; MASKED: while.end:
+; MASKED-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; MASKED-NEXT: ret i32 [[INC_LCSSA]]
+;
entry:
br label %while.cond
@@ -524,6 +620,101 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]]
;
+; MASKED-LABEL: define i32 @compare_bytes_signed_wrap(
+; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; MASKED-NEXT: entry:
+; MASKED-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; MASKED-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; MASKED: mismatch_min_it_check:
+; MASKED-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; MASKED-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; MASKED-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; MASKED-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; MASKED: mismatch_mem_check:
+; MASKED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; MASKED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; MASKED-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; MASKED-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; MASKED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; MASKED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; MASKED-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
+; MASKED-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; MASKED-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
+; MASKED-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
+; MASKED-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
+; MASKED-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
+; MASKED-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
+; MASKED-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; MASKED-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; MASKED-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; MASKED: mismatch_vec_loop_preheader:
+; MASKED-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
+; MASKED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; MASKED-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
+; MASKED-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; MASKED: mismatch_vec_loop:
+; MASKED-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; MASKED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
+; MASKED-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; MASKED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
+; MASKED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; MASKED-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
+; MASKED-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; MASKED-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
+; MASKED-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; MASKED: mismatch_vec_loop_inc:
+; MASKED-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
+; MASKED-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
+; MASKED-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
+; MASKED-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
+; MASKED: mismatch_vec_loop_found:
+; MASKED-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
+; MASKED-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; MASKED-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
+; MASKED-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
+; MASKED-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
+; MASKED-NEXT: br label [[MISMATCH_END]]
+; MASKED: mismatch_loop_pre:
+; MASKED-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; MASKED: mismatch_loop:
+; MASKED-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; MASKED-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; MASKED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
+; MASKED-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; MASKED-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
+; MASKED-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
+; MASKED-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
+; MASKED-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; MASKED: mismatch_loop_inc:
+; MASKED-NEXT: [[TMP43]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; MASKED-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
+; MASKED-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; MASKED: mismatch_end:
+; MASKED-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
+; MASKED-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; MASKED: while.cond:
+; MASKED-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; MASKED-NEXT: [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
+; MASKED-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; MASKED-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; MASKED: while.body:
+; MASKED-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; MASKED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MASKED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; MASKED-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]]
+; MASKED-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; MASKED: byte.compare:
+; MASKED-NEXT: br label [[WHILE_END]]
+; MASKED: while.end:
+; MASKED-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; MASKED-NEXT: ret i32 [[INC_LCSSA]]
+;
; NO-TRANSFORM-LABEL: define i32 @compare_bytes_signed_wrap(
; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
; NO-TRANSFORM-NEXT: entry:
@@ -843,6 +1034,112 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3
; LOOP-DEL-NEXT: store i32 [[SPEC_SELECT]], ptr [[SPEC_SELECT4]], align 4
; LOOP-DEL-NEXT: ret i32 [[SPEC_SELECT]]
;
+; MASKED-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; MASKED-NEXT: entry:
+; MASKED-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; MASKED-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; MASKED: mismatch_min_it_check:
+; MASKED-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; MASKED-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; MASKED-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; MASKED-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; MASKED: mismatch_mem_check:
+; MASKED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; MASKED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; MASKED-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; MASKED-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; MASKED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; MASKED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; MASKED-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
+; MASKED-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; MASKED-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
+; MASKED-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
+; MASKED-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
+; MASKED-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
+; MASKED-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
+; MASKED-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; MASKED-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; MASKED-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; MASKED: mismatch_vec_loop_preheader:
+; MASKED-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
+; MASKED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; MASKED-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
+; MASKED-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; MASKED: mismatch_vec_loop:
+; MASKED-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; MASKED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
+; MASKED-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; MASKED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
+; MASKED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; MASKED-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
+; MASKED-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; MASKED-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
+; MASKED-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; MASKED: mismatch_vec_loop_inc:
+; MASKED-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
+; MASKED-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
+; MASKED-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
+; MASKED-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
+; MASKED: mismatch_vec_loop_found:
+; MASKED-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
+; MASKED-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; MASKED-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
+; MASKED-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
+; MASKED-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
+; MASKED-NEXT: br label [[MISMATCH_END]]
+; MASKED: mismatch_loop_pre:
+; MASKED-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; MASKED: mismatch_loop:
+; MASKED-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; MASKED-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; MASKED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
+; MASKED-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; MASKED-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
+; MASKED-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
+; MASKED-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
+; MASKED-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; MASKED: mismatch_loop_inc:
+; MASKED-NEXT: [[TMP43]] = add i32 [[MISMATCH_INDEX3]], 1
+; MASKED-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
+; MASKED-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; MASKED: mismatch_end:
+; MASKED-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
+; MASKED-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; MASKED: while.cond:
+; MASKED-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; MASKED-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; MASKED-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; MASKED-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; MASKED: while.body:
+; MASKED-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; MASKED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MASKED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; MASKED-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]]
+; MASKED-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; MASKED: while.found:
+; MASKED-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; MASKED-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
+; MASKED-NEXT: br label [[END:%.*]]
+; MASKED: byte.compare:
+; MASKED-NEXT: [[TMP47:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; MASKED-NEXT: br i1 [[TMP47]], label [[WHILE_END]], label [[WHILE_FOUND]]
+; MASKED: while.end:
+; MASKED-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
+; MASKED-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
+; MASKED-NEXT: br label [[END]]
+; MASKED: end:
+; MASKED-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; MASKED-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; MASKED-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; MASKED-NEXT: ret i32 [[MISMATCH_INDEX]]
+;
; NO-TRANSFORM-LABEL: define i32 @compare_bytes_simple_end_ne_found(
; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
; NO-TRANSFORM-NEXT: entry:
@@ -1174,6 +1471,107 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]]
;
+; MASKED-LABEL: define i32 @compare_bytes_extra_cmp(
+; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; MASKED-NEXT: entry:
+; MASKED-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; MASKED-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; MASKED: ph:
+; MASKED-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; MASKED-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; MASKED: mismatch_min_it_check:
+; MASKED-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; MASKED-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; MASKED-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; MASKED-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; MASKED: mismatch_mem_check:
+; MASKED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; MASKED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; MASKED-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; MASKED-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; MASKED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; MASKED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; MASKED-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
+; MASKED-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; MASKED-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
+; MASKED-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
+; MASKED-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
+; MASKED-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
+; MASKED-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
+; MASKED-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; MASKED-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; MASKED-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; MASKED: mismatch_vec_loop_preheader:
+; MASKED-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
+; MASKED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; MASKED-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
+; MASKED-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; MASKED: mismatch_vec_loop:
+; MASKED-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; MASKED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
+; MASKED-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; MASKED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
+; MASKED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; MASKED-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
+; MASKED-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; MASKED-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
+; MASKED-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; MASKED: mismatch_vec_loop_inc:
+; MASKED-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
+; MASKED-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
+; MASKED-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
+; MASKED-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
+; MASKED: mismatch_vec_loop_found:
+; MASKED-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
+; MASKED-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; MASKED-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
+; MASKED-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
+; MASKED-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
+; MASKED-NEXT: br label [[MISMATCH_END]]
+; MASKED: mismatch_loop_pre:
+; MASKED-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; MASKED: mismatch_loop:
+; MASKED-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; MASKED-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; MASKED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
+; MASKED-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; MASKED-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
+; MASKED-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
+; MASKED-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
+; MASKED-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; MASKED: mismatch_loop_inc:
+; MASKED-NEXT: [[TMP43]] = add i32 [[MISMATCH_INDEX]], 1
+; MASKED-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
+; MASKED-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; MASKED: mismatch_end:
+; MASKED-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
+; MASKED-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; MASKED: while.cond:
+; MASKED-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; MASKED-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; MASKED-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; MASKED-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; MASKED: while.body:
+; MASKED-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; MASKED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MASKED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; MASKED-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]]
+; MASKED-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
+; MASKED: byte.compare:
+; MASKED-NEXT: br label [[WHILE_END_LOOPEXIT]]
+; MASKED: while.end.loopexit:
+; MASKED-NEXT: [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; MASKED-NEXT: br label [[WHILE_END]]
+; MASKED: while.end:
+; MASKED-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
+; MASKED-NEXT: ret i32 [[INC_LCSSA]]
+;
; NO-TRANSFORM-LABEL: define i32 @compare_bytes_extra_cmp(
; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) {
; NO-TRANSFORM-NEXT: entry:
@@ -1422,6 +1820,100 @@ define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) {
; LOOP-DEL: common.ret:
; LOOP-DEL-NEXT: ret void
;
+; MASKED-LABEL: define void @compare_bytes_cleanup_block(
+; MASKED-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; MASKED-NEXT: entry:
+; MASKED-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; MASKED: mismatch_min_it_check:
+; MASKED-NEXT: br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; MASKED: mismatch_mem_check:
+; MASKED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
+; MASKED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
+; MASKED-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; MASKED-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; MASKED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
+; MASKED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
+; MASKED-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; MASKED-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; MASKED-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP3]], 12
+; MASKED-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP6]], 12
+; MASKED-NEXT: [[TMP10:%.*]] = lshr i64 [[TMP2]], 12
+; MASKED-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP7]], 12
+; MASKED-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP8]], [[TMP9]]
+; MASKED-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
+; MASKED-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; MASKED-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; MASKED: mismatch_vec_loop_preheader:
+; MASKED-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 1, i64 0)
+; MASKED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; MASKED-NEXT: [[TMP17:%.*]] = mul nuw nsw i64 [[TMP16]], 16
+; MASKED-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; MASKED: mismatch_vec_loop:
+; MASKED-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP15]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP26:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP25:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; MASKED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VEC_INDEX]]
+; MASKED-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; MASKED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VEC_INDEX]]
+; MASKED-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP20]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; MASKED-NEXT: [[TMP22:%.*]] = icmp ne <vscale x 16 x i8> [[TMP19]], [[TMP21]]
+; MASKED-NEXT: [[TMP23:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP22]], <vscale x 16 x i1> zeroinitializer
+; MASKED-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP23]])
+; MASKED-NEXT: br i1 [[TMP24]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; MASKED: mismatch_vec_loop_inc:
+; MASKED-NEXT: [[TMP25]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP17]]
+; MASKED-NEXT: [[TMP26]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP25]], i64 0)
+; MASKED-NEXT: [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i64 0
+; MASKED-NEXT: br i1 [[TMP27]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
+; MASKED: mismatch_vec_loop_found:
+; MASKED-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP23]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; MASKED-NEXT: [[TMP28:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
+; MASKED-NEXT: [[TMP29:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP28]], i1 true)
+; MASKED-NEXT: [[TMP30:%.*]] = zext i32 [[TMP29]] to i64
+; MASKED-NEXT: [[TMP31:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP30]]
+; MASKED-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
+; MASKED-NEXT: br label [[MISMATCH_END]]
+; MASKED: mismatch_loop_pre:
+; MASKED-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; MASKED: mismatch_loop:
+; MASKED-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP39:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; MASKED-NEXT: [[TMP33:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; MASKED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP33]]
+; MASKED-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP34]], align 1
+; MASKED-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP33]]
+; MASKED-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1
+; MASKED-NEXT: [[TMP38:%.*]] = icmp eq i8 [[TMP35]], [[TMP37]]
+; MASKED-NEXT: br i1 [[TMP38]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; MASKED: mismatch_loop_inc:
+; MASKED-NEXT: [[TMP39]] = add i32 [[MISMATCH_INDEX]], 1
+; MASKED-NEXT: [[TMP40:%.*]] = icmp eq i32 [[TMP39]], 0
+; MASKED-NEXT: br i1 [[TMP40]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; MASKED: mismatch_end:
+; MASKED-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP32]], [[MISMATCH_VEC_LOOP_FOUND]] ]
+; MASKED-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; MASKED: while.cond:
+; MASKED-NEXT: [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
+; MASKED-NEXT: [[INC:%.*]] = add i32 [[LEN]], 1
+; MASKED-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; MASKED-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; MASKED: while.body:
+; MASKED-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; MASKED-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MASKED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; MASKED-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]]
+; MASKED-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; MASKED: byte.compare:
+; MASKED-NEXT: [[TMP43:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; MASKED-NEXT: br i1 [[TMP43]], label [[CLEANUP_THREAD]], label [[IF_END]]
+; MASKED: cleanup.thread:
+; MASKED-NEXT: ret void
+; MASKED: if.end:
+; MASKED-NEXT: [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; MASKED-NEXT: ret void
+;
; NO-TRANSFORM-LABEL: define void @compare_bytes_cleanup_block(
; NO-TRANSFORM-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
; NO-TRANSFORM-NEXT: entry:
@@ -1546,6 +2038,29 @@ define i32 @compare_bytes_simple2(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32
; LOOP-DEL-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]]
;
+; MASKED-LABEL: define i32 @compare_bytes_simple2(
+; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; MASKED-NEXT: entry:
+; MASKED-NEXT: br label [[WHILE_COND:%.*]]
+; MASKED: while.cond:
+; MASKED-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; MASKED-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; MASKED-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; MASKED-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; MASKED: while.body:
+; MASKED-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; MASKED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MASKED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; MASKED-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; MASKED-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; MASKED: while.end:
+; MASKED-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; MASKED-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; MASKED-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; MASKED-NEXT: ret i32 [[INC_LCSSA]]
+;
entry:
br label %while.cond
@@ -1637,6 +2152,28 @@ define i32 @compare_bytes_simple3(ptr %a, ptr %b, ptr %c, i32 %d, i32 %len, i32
; LOOP-DEL-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; LOOP-DEL-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4
; LOOP-DEL-NEXT: ret i32 [[FINAL_VAL]]
+;
+; MASKED-LABEL: define i32 @compare_bytes_simple3(
+; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; MASKED-NEXT: entry:
+; MASKED-NEXT: br label [[WHILE_COND:%.*]]
+; MASKED: while.cond:
+; MASKED-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; MASKED-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; MASKED-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; MASKED-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; MASKED: while.body:
+; MASKED-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; MASKED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MASKED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; MASKED-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; MASKED-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; MASKED: while.end:
+; MASKED-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; MASKED-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; MASKED-NEXT: ret i32 [[FINAL_VAL]]
;
entry:
br label %while.cond
@@ -1727,6 +2264,27 @@ define i32 @no_implicit_float(ptr %a, ptr %b, i32 %len, i32 %n) noimplicitfloat
; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]]
;
+; MASKED-LABEL: define i32 @no_implicit_float(
+; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; MASKED-NEXT: entry:
+; MASKED-NEXT: br label [[WHILE_COND:%.*]]
+; MASKED: while.cond:
+; MASKED-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; MASKED-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; MASKED-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; MASKED-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; MASKED: while.body:
+; MASKED-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; MASKED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MASKED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; MASKED-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; MASKED-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; MASKED-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; MASKED: while.end:
+; MASKED-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; MASKED-NEXT: ret i32 [[INC_LCSSA]]
+;
entry:
br label %while.cond
>From f4fc8b8a529546ef8612e30204df6c112a56c99c Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Tue, 2 Jul 2024 18:20:23 -0700
Subject: [PATCH 5/6] Fix signext argument attribute in codegen test
---
.../RISCV/rvv/vfirst-byte-compare-index.ll | 65 +++++++++----------
1 file changed, 32 insertions(+), 33 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll b/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll
index cade25c2c05d8..3107d4e044cae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll
@@ -3,74 +3,73 @@
; Testing VFIRST patterns related to llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
-define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
+define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 signext %len, i32 signext %n) {
; CHECK-LABEL: compare_bytes_simple:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: sext.w a4, a3
-; CHECK-NEXT: addiw a5, a2, 1
-; CHECK-NEXT: bltu a4, a5, .LBB0_7
+; CHECK-NEXT: addiw a4, a2, 1
+; CHECK-NEXT: bltu a3, a4, .LBB0_7
; CHECK-NEXT: # %bb.1: # %mismatch_mem_check
-; CHECK-NEXT: slli a2, a5, 32
+; CHECK-NEXT: slli a2, a4, 32
; CHECK-NEXT: srli a2, a2, 32
-; CHECK-NEXT: slli a6, a3, 32
-; CHECK-NEXT: srli a6, a6, 32
-; CHECK-NEXT: add a7, a0, a2
-; CHECK-NEXT: add t0, a0, a6
+; CHECK-NEXT: slli a5, a3, 32
+; CHECK-NEXT: srli a5, a5, 32
+; CHECK-NEXT: add a6, a0, a2
+; CHECK-NEXT: add a7, a0, a5
+; CHECK-NEXT: srli a6, a6, 12
; CHECK-NEXT: srli a7, a7, 12
-; CHECK-NEXT: srli t0, t0, 12
-; CHECK-NEXT: bne a7, t0, .LBB0_7
+; CHECK-NEXT: bne a6, a7, .LBB0_7
; CHECK-NEXT: # %bb.2: # %mismatch_mem_check
-; CHECK-NEXT: add a7, a1, a2
-; CHECK-NEXT: add t0, a1, a6
+; CHECK-NEXT: add a6, a1, a2
+; CHECK-NEXT: add a7, a1, a5
+; CHECK-NEXT: srli a6, a6, 12
; CHECK-NEXT: srli a7, a7, 12
-; CHECK-NEXT: srli t0, t0, 12
-; CHECK-NEXT: bne a7, t0, .LBB0_7
+; CHECK-NEXT: bne a6, a7, .LBB0_7
; CHECK-NEXT: .LBB0_3: # %mismatch_vec_loop
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: sub a4, a6, a2
+; CHECK-NEXT: sub a4, a5, a2
; CHECK-NEXT: vsetvli a4, a4, e8, m2, ta, ma
-; CHECK-NEXT: add a5, a0, a2
-; CHECK-NEXT: vle8.v v8, (a5)
-; CHECK-NEXT: add a5, a1, a2
-; CHECK-NEXT: vle8.v v10, (a5)
+; CHECK-NEXT: add a6, a0, a2
+; CHECK-NEXT: vle8.v v8, (a6)
+; CHECK-NEXT: add a6, a1, a2
+; CHECK-NEXT: vle8.v v10, (a6)
; CHECK-NEXT: vmsne.vv v12, v8, v10
; CHECK-NEXT: vfirst.m a7, v12
-; CHECK-NEXT: mv a5, a4
+; CHECK-NEXT: mv a6, a4
; CHECK-NEXT: bltz a7, .LBB0_5
; CHECK-NEXT: # %bb.4: # %mismatch_vec_loop
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: mv a5, a7
+; CHECK-NEXT: mv a6, a7
; CHECK-NEXT: .LBB0_5: # %mismatch_vec_loop
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: sext.w a7, a5
+; CHECK-NEXT: sext.w a7, a6
; CHECK-NEXT: bne a7, a4, .LBB0_11
; CHECK-NEXT: # %bb.6: # %mismatch_vec_loop_inc
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: add a2, a2, a4
-; CHECK-NEXT: bne a2, a6, .LBB0_3
+; CHECK-NEXT: bne a2, a5, .LBB0_3
; CHECK-NEXT: j .LBB0_9
; CHECK-NEXT: .LBB0_7: # %mismatch_loop
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: slli a2, a5, 32
+; CHECK-NEXT: slli a2, a4, 32
; CHECK-NEXT: srli a2, a2, 32
-; CHECK-NEXT: add a6, a0, a2
-; CHECK-NEXT: lbu a6, 0(a6)
+; CHECK-NEXT: add a5, a0, a2
+; CHECK-NEXT: lbu a5, 0(a5)
; CHECK-NEXT: add a2, a1, a2
; CHECK-NEXT: lbu a2, 0(a2)
-; CHECK-NEXT: bne a6, a2, .LBB0_10
+; CHECK-NEXT: bne a5, a2, .LBB0_10
; CHECK-NEXT: # %bb.8: # %mismatch_loop_inc
; CHECK-NEXT: # in Loop: Header=BB0_7 Depth=1
-; CHECK-NEXT: addiw a5, a5, 1
-; CHECK-NEXT: bne a4, a5, .LBB0_7
+; CHECK-NEXT: addiw a4, a4, 1
+; CHECK-NEXT: bne a3, a4, .LBB0_7
; CHECK-NEXT: .LBB0_9: # %while.end
; CHECK-NEXT: mv a0, a3
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_10:
-; CHECK-NEXT: mv a0, a5
+; CHECK-NEXT: mv a0, a4
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_11: # %mismatch_vec_loop_found
-; CHECK-NEXT: slli a5, a5, 32
-; CHECK-NEXT: srli a3, a5, 32
+; CHECK-NEXT: slli a6, a6, 32
+; CHECK-NEXT: srli a3, a6, 32
; CHECK-NEXT: add a0, a2, a3
; CHECK-NEXT: ret
entry:
>From d4fb4c068dad36da2d0bc789aa1b8fc21750a9fc Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Tue, 2 Jul 2024 18:43:02 -0700
Subject: [PATCH 6/6] Fix the TargetMachine::registerPassBuilder hook
---
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 3 +--
llvm/lib/Target/RISCV/RISCVTargetMachine.h | 3 +--
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 50f4920d74799..c132a6ef9611c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -574,8 +574,7 @@ void RISCVPassConfig::addPostRegAlloc() {
addPass(createRISCVRedundantCopyEliminationPass());
}
-void RISCVTargetMachine::registerPassBuilderCallbacks(
- PassBuilder &PB, bool PopulateClassToPassNames) {
+void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerLateLoopOptimizationsEPCallback([=](LoopPassManager &LPM,
OptimizationLevel Level) {
LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated));
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
index 7111d5ec80e47..ce7b7907e1f3a 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
@@ -59,8 +59,7 @@ class RISCVTargetMachine : public LLVMTargetMachine {
PerFunctionMIParsingState &PFS,
SMDiagnostic &Error,
SMRange &SourceRange) const override;
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool PopulateClassToPassNames) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
};
} // namespace llvm
More information about the llvm-commits
mailing list