[llvm] [RISCV][LoopIdiom] Support VP intrinsics in LoopIdiomTransform (PR #94082)
Min-Yih Hsu via llvm-commits
llvm-commits at lists.llvm.org
Fri May 31 16:54:37 PDT 2024
https://github.com/mshockwave created https://github.com/llvm/llvm-project/pull/94082
Teach LoopIdiomTransform to generate VP intrinsics to replace the byte compare loops. Right now RISC-V is the only user of this style.
--------
This PR stacks on top of #94081
>From b1420ba9126b9789a6111f80540929b9fa5580e7 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 23 May 2024 15:49:29 -0700
Subject: [PATCH 1/2] [AArch64][LoopIdiom] Generalize AArch64LoopIdiomTransform
into LoopIdiomTransform
To facilitate sharing LoopIdiomTransform between AArch64 and RISC-V,
this patch first moves AArch64LoopIdiomTransform from lib/Target/AArch64
to lib/Transforms/Vectorize. In addition, key component that is subject
to differ from RVV's vectorization style is factored out preemptively
in this patch.
---
.../Vectorize/LoopIdiomTransform.h} | 14 +-
llvm/lib/Passes/PassBuilder.cpp | 1 +
llvm/lib/Passes/PassRegistry.def | 1 +
llvm/lib/Target/AArch64/AArch64.h | 1 -
.../Target/AArch64/AArch64PassRegistry.def | 20 -
.../Target/AArch64/AArch64TargetMachine.cpp | 8 +-
.../lib/Target/AArch64/AArch64TargetMachine.h | 1 -
llvm/lib/Target/AArch64/CMakeLists.txt | 2 +-
llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 +
.../Vectorize/LoopIdiomTransform.cpp} | 443 ++++++++----------
.../LoopIdiom/AArch64/byte-compare-index.ll | 405 ++++++++--------
11 files changed, 402 insertions(+), 495 deletions(-)
rename llvm/{lib/Target/AArch64/AArch64LoopIdiomTransform.h => include/llvm/Transforms/Vectorize/LoopIdiomTransform.h} (60%)
delete mode 100644 llvm/lib/Target/AArch64/AArch64PassRegistry.def
rename llvm/lib/{Target/AArch64/AArch64LoopIdiomTransform.cpp => Transforms/Vectorize/LoopIdiomTransform.cpp} (71%)
diff --git a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomTransform.h
similarity index 60%
rename from llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h
rename to llvm/include/llvm/Transforms/Vectorize/LoopIdiomTransform.h
index cc68425bb68b5..a97dcc7ae3a3f 100644
--- a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomTransform.h
@@ -1,4 +1,4 @@
-//===- AArch64LoopIdiomTransform.h --------------------------------------===//
+//===----------LoopIdiomTransform.h -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,20 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H
-#define LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H
+#ifndef LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPIDIOMTRANSFORM_H
+#define LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPIDIOMTRANSFORM_H
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
namespace llvm {
-
-struct AArch64LoopIdiomTransformPass
- : PassInfoMixin<AArch64LoopIdiomTransformPass> {
+struct LoopIdiomTransformPass : PassInfoMixin<LoopIdiomTransformPass> {
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR, LPMUpdater &U);
};
-
} // namespace llvm
-
-#endif // LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H
+#endif // LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPIDIOMTRANSFORM_H
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 734ca4d5deec9..bf11146a05e5a 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -295,6 +295,7 @@
#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
#include "llvm/Transforms/Utils/UnifyLoopExits.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
+#include "llvm/Transforms/Vectorize/LoopIdiomTransform.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 50682ca4970f1..714058f91bfc6 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -621,6 +621,7 @@ LOOP_PASS("invalidate<all>", InvalidateAllAnalysesPass())
LOOP_PASS("loop-bound-split", LoopBoundSplitPass())
LOOP_PASS("loop-deletion", LoopDeletionPass())
LOOP_PASS("loop-idiom", LoopIdiomRecognizePass())
+LOOP_PASS("loop-idiom-transform", LoopIdiomTransformPass())
LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass())
LOOP_PASS("loop-predication", LoopPredicationPass())
LOOP_PASS("loop-reduce", LoopStrengthReducePass())
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index b70fbe42fe5fc..19e0d1e2f5960 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -90,7 +90,6 @@ void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
void initializeAArch64ExpandPseudoPass(PassRegistry &);
void initializeAArch64GlobalsTaggingPass(PassRegistry &);
void initializeAArch64LoadStoreOptPass(PassRegistry&);
-void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
void initializeAArch64MIPeepholeOptPass(PassRegistry &);
void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
diff --git a/llvm/lib/Target/AArch64/AArch64PassRegistry.def b/llvm/lib/Target/AArch64/AArch64PassRegistry.def
deleted file mode 100644
index ca944579f93a9..0000000000000
--- a/llvm/lib/Target/AArch64/AArch64PassRegistry.def
+++ /dev/null
@@ -1,20 +0,0 @@
-//===- AArch64PassRegistry.def - Registry of AArch64 passes -----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is used as the registry of passes that are part of the
-// AArch64 backend.
-//
-//===----------------------------------------------------------------------===//
-
-// NOTE: NO INCLUDE GUARD DESIRED!
-
-#ifndef LOOP_PASS
-#define LOOP_PASS(NAME, CREATE_PASS)
-#endif
-LOOP_PASS("aarch64-lit", AArch64LoopIdiomTransformPass())
-#undef LOOP_PASS
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 945ab5cf1f303..a6e26501541f3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -11,7 +11,6 @@
#include "AArch64TargetMachine.h"
#include "AArch64.h"
-#include "AArch64LoopIdiomTransform.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64MachineScheduler.h"
#include "AArch64MacroFusion.h"
@@ -52,6 +51,7 @@
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/CFGuard.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Vectorize/LoopIdiomTransform.h"
#include <memory>
#include <optional>
#include <string>
@@ -234,7 +234,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
initializeAArch64DeadRegisterDefinitionsPass(*PR);
initializeAArch64ExpandPseudoPass(*PR);
initializeAArch64LoadStoreOptPass(*PR);
- initializeAArch64LoopIdiomTransformLegacyPassPass(*PR);
initializeAArch64MIPeepholeOptPass(*PR);
initializeAArch64SIMDInstrOptPass(*PR);
initializeAArch64O0PreLegalizerCombinerPass(*PR);
@@ -553,12 +552,9 @@ class AArch64PassConfig : public TargetPassConfig {
void AArch64TargetMachine::registerPassBuilderCallbacks(
PassBuilder &PB, bool PopulateClassToPassNames) {
-#define GET_PASS_REGISTRY "AArch64PassRegistry.def"
-#include "llvm/Passes/TargetPassRegistry.inc"
-
PB.registerLateLoopOptimizationsEPCallback(
[=](LoopPassManager &LPM, OptimizationLevel Level) {
- LPM.addPass(AArch64LoopIdiomTransformPass());
+ LPM.addPass(LoopIdiomTransformPass());
});
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
index 8fb68b06f1378..e396d9204716a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -14,7 +14,6 @@
#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H
#include "AArch64InstrInfo.h"
-#include "AArch64LoopIdiomTransform.h"
#include "AArch64Subtarget.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index 8e76f6c9279e7..639bc0707dff2 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -65,7 +65,6 @@ add_llvm_target(AArch64CodeGen
AArch64ISelLowering.cpp
AArch64InstrInfo.cpp
AArch64LoadStoreOptimizer.cpp
- AArch64LoopIdiomTransform.cpp
AArch64LowerHomogeneousPrologEpilog.cpp
AArch64MachineFunctionInfo.cpp
AArch64MachineScheduler.cpp
@@ -112,6 +111,7 @@ add_llvm_target(AArch64CodeGen
Target
TargetParser
TransformUtils
+ Vectorize
ADD_TO_COMPONENT
AArch64
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 9674094024b9e..3ca5c404d020f 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,5 +1,6 @@
add_llvm_component_library(LLVMVectorize
LoadStoreVectorizer.cpp
+ LoopIdiomTransform.cpp
LoopVectorizationLegality.cpp
LoopVectorize.cpp
SLPVectorizer.cpp
diff --git a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomTransform.cpp
similarity index 71%
rename from llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp
rename to llvm/lib/Transforms/Vectorize/LoopIdiomTransform.cpp
index a9bd8d877fb2e..5af1d6aa3b61e 100644
--- a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomTransform.cpp
@@ -1,4 +1,4 @@
-//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -------------===//
+//===-------- LoopIdiomTransform.cpp - Loop idiom recognition -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -35,7 +35,8 @@
//
//===----------------------------------------------------------------------===//
-#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Transforms/Vectorize/LoopIdiomTransform.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -44,47 +45,46 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
using namespace PatternMatch;
-#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+#define DEBUG_TYPE "loop-idiom-transform"
-static cl::opt<bool>
- DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
- cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
-
-static cl::opt<bool> DisableByteCmp(
- "disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
- cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
- "not convert byte-compare loop(s)."));
-
-static cl::opt<bool> VerifyLoops(
- "aarch64-lit-verify", cl::Hidden, cl::init(false),
- cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+static cl::opt<bool> DisableAll("disable-loop-idiom-transform-all", cl::Hidden,
+ cl::init(false),
+ cl::desc("Disable Loop Idiom Transform Pass."));
-namespace llvm {
-
-void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
-Pass *createAArch64LoopIdiomTransformPass();
+static cl::opt<bool>
+ DisableByteCmp("disable-loop-idiom-transform-bytecmp", cl::Hidden,
+ cl::init(false),
+ cl::desc("Proceed with Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
-} // end namespace llvm
+static cl::opt<bool>
+ VerifyLoops("verify-loop-idiom-transform", cl::Hidden, cl::init(false),
+ cl::desc("Verify loops generated Loop Idiom Transform Pass."));
namespace {
-
-class AArch64LoopIdiomTransform {
+class LoopIdiomTransform {
Loop *CurLoop = nullptr;
DominatorTree *DT;
LoopInfo *LI;
const TargetTransformInfo *TTI;
const DataLayout *DL;
+ // Blocks that will be used for inserting vectorized code.
+ BasicBlock *EndBlock = nullptr;
+ BasicBlock *VectorLoopPreheaderBlock = nullptr;
+ BasicBlock *VectorLoopStartBlock = nullptr;
+ BasicBlock *VectorLoopMismatchBlock = nullptr;
+ BasicBlock *VectorLoopIncBlock = nullptr;
+
public:
- explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
- const TargetTransformInfo *TTI,
- const DataLayout *DL)
+ explicit LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
: DT(DT), LI(LI), TTI(TTI), DL(DL) {}
bool run(Loop *L);
@@ -98,83 +98,32 @@ class AArch64LoopIdiomTransform {
SmallVectorImpl<BasicBlock *> &ExitBlocks);
bool recognizeByteCompare();
+
Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
Instruction *Index, Value *Start, Value *MaxLen);
+
+ Value *createMaskedFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+ GetElementPtrInst *GEPB, Value *ExtStart,
+ Value *ExtEnd);
+
void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
PHINode *IndPhi, Value *MaxLen, Instruction *Index,
Value *Start, bool IncIdx, BasicBlock *FoundBB,
BasicBlock *EndBB);
/// @}
};
+} // anonymous namespace
-class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
-public:
- static char ID;
-
- explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
- initializeAArch64LoopIdiomTransformLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override {
- return "Transform AArch64-specific loop idioms";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
-};
-
-bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
- LPPassManager &LPM) {
-
- if (skipLoop(L))
- return false;
-
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *L->getHeader()->getParent());
- return AArch64LoopIdiomTransform(
- DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout())
- .run(L);
-}
-
-} // end anonymous namespace
-
-char AArch64LoopIdiomTransformLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(
- AArch64LoopIdiomTransformLegacyPass, "aarch64-lit",
- "Transform specific loop idioms into optimized vector forms", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(
- AArch64LoopIdiomTransformLegacyPass, "aarch64-lit",
- "Transform specific loop idioms into optimized vector forms", false, false)
-
-Pass *llvm::createAArch64LoopIdiomTransformPass() {
- return new AArch64LoopIdiomTransformLegacyPass();
-}
-
-PreservedAnalyses
-AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
+PreservedAnalyses LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
if (DisableAll)
return PreservedAnalyses::all();
const auto *DL = &L.getHeader()->getModule()->getDataLayout();
- AArch64LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL);
+ LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL);
if (!LIT.run(&L))
return PreservedAnalyses::all();
@@ -183,11 +132,11 @@ AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM,
//===----------------------------------------------------------------------===//
//
-// Implementation of AArch64LoopIdiomTransform
+// Implementation of LoopIdiomTransform
//
//===----------------------------------------------------------------------===//
-bool AArch64LoopIdiomTransform::run(Loop *L) {
+bool LoopIdiomTransform::run(Loop *L) {
CurLoop = L;
Function &F = *L->getHeader()->getParent();
@@ -211,7 +160,7 @@ bool AArch64LoopIdiomTransform::run(Loop *L) {
return recognizeByteCompare();
}
-bool AArch64LoopIdiomTransform::recognizeByteCompare() {
+bool LoopIdiomTransform::recognizeByteCompare() {
// Currently the transformation only works on scalable vector types, although
// there is no fundamental reason why it cannot be made to work for fixed
// width too.
@@ -224,7 +173,7 @@ bool AArch64LoopIdiomTransform::recognizeByteCompare() {
BasicBlock *Header = CurLoop->getHeader();
- // In AArch64LoopIdiomTransform::run we have already checked that the loop
+ // In LoopIdiomTransform::run we have already checked that the loop
// has a preheader so we can assume it's in a canonical form.
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2)
return false;
@@ -242,8 +191,7 @@ bool AArch64LoopIdiomTransform::recognizeByteCompare() {
// %cmp.not = icmp eq i32 %inc, %n
// br i1 %cmp.not, label %while.end, label %while.body
//
- auto CondBBInsts = LoopBlocks[0]->instructionsWithoutDebug();
- if (std::distance(CondBBInsts.begin(), CondBBInsts.end()) > 4)
+ if (LoopBlocks[0]->sizeWithoutDebug() > 4)
return false;
// The second block should contain 7 instructions, e.g.
@@ -257,8 +205,7 @@ bool AArch64LoopIdiomTransform::recognizeByteCompare() {
// %cmp.not.ld = icmp eq i8 %load.a, %load.b
// br i1 %cmp.not.ld, label %while.cond, label %while.end
//
- auto LoopBBInsts = LoopBlocks[1]->instructionsWithoutDebug();
- if (std::distance(LoopBBInsts.begin(), LoopBBInsts.end()) > 7)
+ if (LoopBlocks[1]->sizeWithoutDebug() > 7)
return false;
// The incoming value to the PHI node from the loop should be an add of 1.
@@ -393,7 +340,109 @@ bool AArch64LoopIdiomTransform::recognizeByteCompare() {
return true;
}
-Value *AArch64LoopIdiomTransform::expandFindMismatch(
+Value *LoopIdiomTransform::createMaskedFindMismatch(IRBuilder<> &Builder,
+ GetElementPtrInst *GEPA,
+ GetElementPtrInst *GEPB,
+ Value *ExtStart,
+ Value *ExtEnd) {
+ Type *I64Type = Builder.getInt64Ty();
+ Type *ResType = Builder.getInt32Ty();
+ Type *LoadType = Builder.getInt8Ty();
+ Value *PtrA = GEPA->getPointerOperand();
+ Value *PtrB = GEPB->getPointerOperand();
+
+ // At this point we know two things must be true:
+ // 1. Start <= End
+ // 2. ExtMaxLen <= MinPageSize due to the page checks.
+ // Therefore, we know that we can use a 64-bit induction variable that
+ // starts from 0 -> ExtMaxLen and it will not overflow.
+ ScalableVectorType *PredVTy =
+ ScalableVectorType::get(Builder.getInt1Ty(), 16);
+
+ Value *InitialPred = Builder.CreateIntrinsic(
+ Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
+
+ Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
+ VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "",
+ /*HasNUW=*/true, /*HasNSW=*/true);
+
+ Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
+ Builder.getInt1(false));
+
+ BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
+ Builder.Insert(JumpToVectorLoop);
+
+ // Set up the first vector loop block by creating the PHIs, doing the vector
+ // loads and comparing the vectors.
+ Builder.SetInsertPoint(VectorLoopStartBlock);
+ PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred");
+ LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
+ PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
+ VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
+ Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16);
+ Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
+
+ Value *VectorLhsGep = Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi);
+ if (GEPA->isInBounds())
+ cast<GetElementPtrInst>(VectorLhsGep)->setIsInBounds(true);
+ Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep,
+ Align(1), LoopPred, Passthru);
+
+ Value *VectorRhsGep = Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi);
+ if (GEPB->isInBounds())
+ cast<GetElementPtrInst>(VectorRhsGep)->setIsInBounds(true);
+ Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep,
+ Align(1), LoopPred, Passthru);
+
+ Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad);
+ VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse);
+ Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp);
+ BranchInst *VectorEarlyExit = BranchInst::Create(
+ VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
+ Builder.Insert(VectorEarlyExit);
+
+ // Increment the index counter and calculate the predicate for the next
+ // iteration of the loop. We branch back to the start of the loop if there
+ // is at least one active lane.
+ Builder.SetInsertPoint(VectorLoopIncBlock);
+ Value *NewVectorIndexPhi =
+ Builder.CreateAdd(VectorIndexPhi, VecLen, "",
+ /*HasNUW=*/true, /*HasNSW=*/true);
+ VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
+ Value *NewPred =
+ Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
+ {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd});
+ LoopPred->addIncoming(NewPred, VectorLoopIncBlock);
+
+ Value *PredHasActiveLanes =
+ Builder.CreateExtractElement(NewPred, uint64_t(0));
+ BranchInst *VectorLoopBranchBack =
+ BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
+ Builder.Insert(VectorLoopBranchBack);
+
+ // If we found a mismatch then we need to calculate which lane in the vector
+ // had a mismatch and add that on to the current loop index.
+ Builder.SetInsertPoint(VectorLoopMismatchBlock);
+ PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred");
+ FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock);
+ PHINode *LastLoopPred =
+ Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred");
+ LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock);
+ PHINode *VectorFoundIndex =
+ Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index");
+ VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
+
+ Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred);
+ Value *Ctz = Builder.CreateIntrinsic(
+ Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()},
+ {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)});
+ Ctz = Builder.CreateZExt(Ctz, I64Type);
+ Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "",
+ /*HasNUW=*/true, /*HasNSW=*/true);
+ return Builder.CreateTrunc(VectorLoopRes64, ResType);
+}
+
+Value *LoopIdiomTransform::expandFindMismatch(
IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) {
Value *PtrA = GEPA->getPointerOperand();
@@ -407,17 +456,16 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch(
Type *ResType = Builder.getInt32Ty();
// Split block in the original loop preheader.
- BasicBlock *EndBlock =
- SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end");
+ EndBlock = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end");
// Create the blocks that we're going to need:
// 1. A block for checking the zero-extended length exceeds 0
// 2. A block to check that the start and end addresses of a given array
// lie on the same page.
- // 3. The SVE loop preheader.
- // 4. The first SVE loop block.
- // 5. The SVE loop increment block.
- // 6. A block we can jump to from the SVE loop when a mismatch is found.
+ // 3. The vector loop preheader.
+ // 4. The first vector loop block.
+ // 5. The vector loop increment block.
+ // 6. A block we can jump to from the vector loop when a mismatch is found.
// 7. The first block of the scalar loop itself, containing PHIs , loads
// and cmp.
// 8. A scalar loop increment block to increment the PHIs and go back
@@ -432,17 +480,17 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch(
BasicBlock *MemCheckBlock = BasicBlock::Create(
Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock);
- BasicBlock *SVELoopPreheaderBlock = BasicBlock::Create(
- Ctx, "mismatch_sve_loop_preheader", EndBlock->getParent(), EndBlock);
+ VectorLoopPreheaderBlock = BasicBlock::Create(
+ Ctx, "mismatch_vec_loop_preheader", EndBlock->getParent(), EndBlock);
- BasicBlock *SVELoopStartBlock = BasicBlock::Create(
- Ctx, "mismatch_sve_loop", EndBlock->getParent(), EndBlock);
+ VectorLoopStartBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop",
+ EndBlock->getParent(), EndBlock);
- BasicBlock *SVELoopIncBlock = BasicBlock::Create(
- Ctx, "mismatch_sve_loop_inc", EndBlock->getParent(), EndBlock);
+ VectorLoopIncBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_inc",
+ EndBlock->getParent(), EndBlock);
- BasicBlock *SVELoopMismatchBlock = BasicBlock::Create(
- Ctx, "mismatch_sve_loop_found", EndBlock->getParent(), EndBlock);
+ VectorLoopMismatchBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_found",
+ EndBlock->getParent(), EndBlock);
BasicBlock *LoopPreHeaderBlock = BasicBlock::Create(
Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock);
@@ -456,26 +504,27 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch(
DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock},
{DominatorTree::Delete, Preheader, EndBlock}});
- // Update LoopInfo with the new SVE & scalar loops.
- auto SVELoop = LI->AllocateLoop();
+ // Update LoopInfo with the new vector & scalar loops.
+ auto VectorLoop = LI->AllocateLoop();
auto ScalarLoop = LI->AllocateLoop();
if (CurLoop->getParentLoop()) {
CurLoop->getParentLoop()->addBasicBlockToLoop(MinItCheckBlock, *LI);
CurLoop->getParentLoop()->addBasicBlockToLoop(MemCheckBlock, *LI);
- CurLoop->getParentLoop()->addBasicBlockToLoop(SVELoopPreheaderBlock, *LI);
- CurLoop->getParentLoop()->addChildLoop(SVELoop);
- CurLoop->getParentLoop()->addBasicBlockToLoop(SVELoopMismatchBlock, *LI);
+ CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopPreheaderBlock,
+ *LI);
+ CurLoop->getParentLoop()->addChildLoop(VectorLoop);
+ CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopMismatchBlock, *LI);
CurLoop->getParentLoop()->addBasicBlockToLoop(LoopPreHeaderBlock, *LI);
CurLoop->getParentLoop()->addChildLoop(ScalarLoop);
} else {
- LI->addTopLevelLoop(SVELoop);
+ LI->addTopLevelLoop(VectorLoop);
LI->addTopLevelLoop(ScalarLoop);
}
// Add the new basic blocks to their associated loops.
- SVELoop->addBasicBlockToLoop(SVELoopStartBlock, *LI);
- SVELoop->addBasicBlockToLoop(SVELoopIncBlock, *LI);
+ VectorLoop->addBasicBlockToLoop(VectorLoopStartBlock, *LI);
+ VectorLoop->addBasicBlockToLoop(VectorLoopIncBlock, *LI);
ScalarLoop->addBasicBlockToLoop(LoopStartBlock, *LI);
ScalarLoop->addBasicBlockToLoop(LoopIncBlock, *LI);
@@ -497,10 +546,6 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch(
MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1));
Builder.Insert(MinItCheckBr);
- DTU.applyUpdates(
- {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock},
- {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}});
-
// For each of the arrays, check the start/end addresses are on the same
// page.
Builder.SetInsertPoint(MemCheckBlock);
@@ -537,131 +582,26 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch(
Value *CombinedPageCmp = Builder.CreateOr(LhsPageCmp, RhsPageCmp);
BranchInst *CombinedPageCmpCmpBr = BranchInst::Create(
- LoopPreHeaderBlock, SVELoopPreheaderBlock, CombinedPageCmp);
+ LoopPreHeaderBlock, VectorLoopPreheaderBlock, CombinedPageCmp);
CombinedPageCmpCmpBr->setMetadata(
LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext())
.createBranchWeights(10, 90));
Builder.Insert(CombinedPageCmpCmpBr);
- DTU.applyUpdates(
- {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock},
- {DominatorTree::Insert, MemCheckBlock, SVELoopPreheaderBlock}});
-
- // Set up the SVE loop preheader, i.e. calculate initial loop predicate,
+ // Set up the vector loop preheader, i.e. calculate initial loop predicate,
// zero-extend MaxLen to 64-bits, determine the number of vector elements
// processed in each iteration, etc.
- Builder.SetInsertPoint(SVELoopPreheaderBlock);
-
- // At this point we know two things must be true:
- // 1. Start <= End
- // 2. ExtMaxLen <= MinPageSize due to the page checks.
- // Therefore, we know that we can use a 64-bit induction variable that
- // starts from 0 -> ExtMaxLen and it will not overflow.
- ScalableVectorType *PredVTy =
- ScalableVectorType::get(Builder.getInt1Ty(), 16);
-
- Value *InitialPred = Builder.CreateIntrinsic(
- Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
-
- Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
- VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "",
- /*HasNUW=*/true, /*HasNSW=*/true);
-
- Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
- Builder.getInt1(false));
-
- BranchInst *JumpToSVELoop = BranchInst::Create(SVELoopStartBlock);
- Builder.Insert(JumpToSVELoop);
+ Builder.SetInsertPoint(VectorLoopPreheaderBlock);
- DTU.applyUpdates(
- {{DominatorTree::Insert, SVELoopPreheaderBlock, SVELoopStartBlock}});
-
- // Set up the first SVE loop block by creating the PHIs, doing the vector
- // loads and comparing the vectors.
- Builder.SetInsertPoint(SVELoopStartBlock);
- PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_sve_loop_pred");
- LoopPred->addIncoming(InitialPred, SVELoopPreheaderBlock);
- PHINode *SVEIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_sve_index");
- SVEIndexPhi->addIncoming(ExtStart, SVELoopPreheaderBlock);
- Type *SVELoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16);
- Value *Passthru = ConstantInt::getNullValue(SVELoadType);
-
- Value *SVELhsGep = Builder.CreateGEP(LoadType, PtrA, SVEIndexPhi);
- if (GEPA->isInBounds())
- cast<GetElementPtrInst>(SVELhsGep)->setIsInBounds(true);
- Value *SVELhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVELhsGep, Align(1),
- LoopPred, Passthru);
-
- Value *SVERhsGep = Builder.CreateGEP(LoadType, PtrB, SVEIndexPhi);
- if (GEPB->isInBounds())
- cast<GetElementPtrInst>(SVERhsGep)->setIsInBounds(true);
- Value *SVERhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVERhsGep, Align(1),
- LoopPred, Passthru);
-
- Value *SVEMatchCmp = Builder.CreateICmpNE(SVELhsLoad, SVERhsLoad);
- SVEMatchCmp = Builder.CreateSelect(LoopPred, SVEMatchCmp, PFalse);
- Value *SVEMatchHasActiveLanes = Builder.CreateOrReduce(SVEMatchCmp);
- BranchInst *SVEEarlyExit = BranchInst::Create(
- SVELoopMismatchBlock, SVELoopIncBlock, SVEMatchHasActiveLanes);
- Builder.Insert(SVEEarlyExit);
-
- DTU.applyUpdates(
- {{DominatorTree::Insert, SVELoopStartBlock, SVELoopMismatchBlock},
- {DominatorTree::Insert, SVELoopStartBlock, SVELoopIncBlock}});
-
- // Increment the index counter and calculate the predicate for the next
- // iteration of the loop. We branch back to the start of the loop if there
- // is at least one active lane.
- Builder.SetInsertPoint(SVELoopIncBlock);
- Value *NewSVEIndexPhi = Builder.CreateAdd(SVEIndexPhi, VecLen, "",
- /*HasNUW=*/true, /*HasNSW=*/true);
- SVEIndexPhi->addIncoming(NewSVEIndexPhi, SVELoopIncBlock);
- Value *NewPred =
- Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
- {PredVTy, I64Type}, {NewSVEIndexPhi, ExtEnd});
- LoopPred->addIncoming(NewPred, SVELoopIncBlock);
-
- Value *PredHasActiveLanes =
- Builder.CreateExtractElement(NewPred, uint64_t(0));
- BranchInst *SVELoopBranchBack =
- BranchInst::Create(SVELoopStartBlock, EndBlock, PredHasActiveLanes);
- Builder.Insert(SVELoopBranchBack);
-
- DTU.applyUpdates({{DominatorTree::Insert, SVELoopIncBlock, SVELoopStartBlock},
- {DominatorTree::Insert, SVELoopIncBlock, EndBlock}});
-
- // If we found a mismatch then we need to calculate which lane in the vector
- // had a mismatch and add that on to the current loop index.
- Builder.SetInsertPoint(SVELoopMismatchBlock);
- PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_sve_found_pred");
- FoundPred->addIncoming(SVEMatchCmp, SVELoopStartBlock);
- PHINode *LastLoopPred =
- Builder.CreatePHI(PredVTy, 1, "mismatch_sve_last_loop_pred");
- LastLoopPred->addIncoming(LoopPred, SVELoopStartBlock);
- PHINode *SVEFoundIndex =
- Builder.CreatePHI(I64Type, 1, "mismatch_sve_found_index");
- SVEFoundIndex->addIncoming(SVEIndexPhi, SVELoopStartBlock);
-
- Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred);
- Value *Ctz = Builder.CreateIntrinsic(
- Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()},
- {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)});
- Ctz = Builder.CreateZExt(Ctz, I64Type);
- Value *SVELoopRes64 = Builder.CreateAdd(SVEFoundIndex, Ctz, "",
- /*HasNUW=*/true, /*HasNSW=*/true);
- Value *SVELoopRes = Builder.CreateTrunc(SVELoopRes64, ResType);
+ Value *VectorLoopRes =
+ createMaskedFindMismatch(Builder, GEPA, GEPB, ExtStart, ExtEnd);
Builder.Insert(BranchInst::Create(EndBlock));
- DTU.applyUpdates({{DominatorTree::Insert, SVELoopMismatchBlock, EndBlock}});
-
// Generate code for scalar loop.
Builder.SetInsertPoint(LoopPreHeaderBlock);
Builder.Insert(BranchInst::Create(LoopStartBlock));
- DTU.applyUpdates(
- {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}});
-
Builder.SetInsertPoint(LoopStartBlock);
PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index");
IndexPhi->addIncoming(Start, LoopPreHeaderBlock);
@@ -685,9 +625,6 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch(
BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp);
Builder.Insert(MatchCmpBr);
- DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock},
- {DominatorTree::Insert, LoopStartBlock, EndBlock}});
-
// Have we reached the maximum permitted length for the loop?
Builder.SetInsertPoint(LoopIncBlock);
Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "",
@@ -698,29 +635,26 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch(
BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp);
Builder.Insert(IVCmpBr);
- DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock},
- {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}});
-
// In the end block we need to insert a PHI node to deal with three cases:
// 1. We didn't find a mismatch in the scalar loop, so we return MaxLen.
// 2. We exitted the scalar loop early due to a mismatch and need to return
// the index that we found.
- // 3. We didn't find a mismatch in the SVE loop, so we return MaxLen.
- // 4. We exitted the SVE loop early due to a mismatch and need to return
+ // 3. We didn't find a mismatch in the vector loop, so we return MaxLen.
+ // 4. We exitted the vector loop early due to a mismatch and need to return
// the index that we found.
Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt());
PHINode *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result");
ResPhi->addIncoming(MaxLen, LoopIncBlock);
ResPhi->addIncoming(IndexPhi, LoopStartBlock);
- ResPhi->addIncoming(MaxLen, SVELoopIncBlock);
- ResPhi->addIncoming(SVELoopRes, SVELoopMismatchBlock);
+ ResPhi->addIncoming(MaxLen, VectorLoopIncBlock);
+ ResPhi->addIncoming(VectorLoopRes, VectorLoopMismatchBlock);
Value *FinalRes = Builder.CreateTrunc(ResPhi, ResType);
if (VerifyLoops) {
ScalarLoop->verifyLoop();
- SVELoop->verifyLoop();
- if (!SVELoop->isRecursivelyLCSSAForm(*DT, *LI))
+ VectorLoop->verifyLoop();
+ if (!VectorLoop->isRecursivelyLCSSAForm(*DT, *LI))
report_fatal_error("Loops must remain in LCSSA form!");
if (!ScalarLoop->isRecursivelyLCSSAForm(*DT, *LI))
report_fatal_error("Loops must remain in LCSSA form!");
@@ -729,10 +663,12 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch(
return FinalRes;
}
-void AArch64LoopIdiomTransform::transformByteCompare(
- GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, PHINode *IndPhi,
- Value *MaxLen, Instruction *Index, Value *Start, bool IncIdx,
- BasicBlock *FoundBB, BasicBlock *EndBB) {
+void LoopIdiomTransform::transformByteCompare(GetElementPtrInst *GEPA,
+ GetElementPtrInst *GEPB,
+ PHINode *IndPhi, Value *MaxLen,
+ Instruction *Index, Value *Start,
+ bool IncIdx, BasicBlock *FoundBB,
+ BasicBlock *EndBB) {
// Insert the byte compare code at the end of the preheader block
BasicBlock *Preheader = CurLoop->getLoopPreheader();
@@ -742,6 +678,11 @@ void AArch64LoopIdiomTransform::transformByteCompare(
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
+ // Safeguard to check if we build the correct DomTree with DTU.
+ auto CheckDTU = llvm::make_scope_exit([&]() {
+ assert(DTU.getDomTree().verify() && "Ill-formed DomTree built by DTU");
+ });
+
// Increment the pointer if this was done before the loads in the loop.
if (IncIdx)
Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1));
@@ -777,12 +718,8 @@ void AArch64LoopIdiomTransform::transformByteCompare(
if (FoundBB != EndBB) {
Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen);
Builder.CreateCondBr(FoundCmp, EndBB, FoundBB);
- DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB},
- {DominatorTree::Insert, CmpBB, EndBB}});
-
} else {
Builder.CreateBr(FoundBB);
- DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}});
}
auto fixSuccessorPhis = [&](BasicBlock *SuccBB) {
diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll
index 27ab11446b571..3e73c4653902f 100644
--- a/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll
+++ b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll
@@ -1,10 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -aarch64-lit -aarch64-lit-verify -verify-dom-info -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s
-; RUN: opt -aarch64-lit -simplifycfg -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=LOOP-DEL
-; RUN: opt -aarch64-lit -mtriple aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=NO-TRANSFORM
-; RUN: opt -p aarch64-lit -aarch64-lit-verify -verify-dom-info -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s
-; RUN: opt -passes='function(loop(aarch64-lit)),simplifycfg' -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=LOOP-DEL
-; RUN: opt -p aarch64-lit -mtriple aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=NO-TRANSFORM
+; RUN: opt -p loop-idiom-transform -verify-loop-idiom-transform -verify-dom-info -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s
+; RUN: opt -passes='function(loop(loop-idiom-transform)),simplifycfg' -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=LOOP-DEL
+; RUN: opt -p loop-idiom-transform -mtriple aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=NO-TRANSFORM
define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) {
; CHECK-LABEL: define i32 @compare_bytes_simple(
@@ -33,36 +30,36 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) {
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
-; CHECK: mismatch_sve_loop_preheader:
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: mismatch_vec_loop_preheader:
; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
-; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
-; CHECK: mismatch_sve_loop:
-; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
-; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
-; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
+; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
+; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
-; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
-; CHECK: mismatch_sve_loop_inc:
-; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
; CHECK-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
-; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]]
-; CHECK: mismatch_sve_loop_found:
-; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
-; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
; CHECK-NEXT: br label [[MISMATCH_END]]
; CHECK: mismatch_loop_pre:
@@ -81,7 +78,7 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) {
; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; CHECK: mismatch_end:
-; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; CHECK: while.cond:
; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
@@ -128,36 +125,36 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) {
; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
-; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
-; LOOP-DEL: mismatch_sve_loop_preheader:
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; LOOP-DEL: mismatch_vec_loop_preheader:
; LOOP-DEL-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
-; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
-; LOOP-DEL: mismatch_sve_loop:
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
-; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
-; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
-; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
-; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; LOOP-DEL: mismatch_vec_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
-; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
-; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
-; LOOP-DEL: mismatch_sve_loop_inc:
-; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; LOOP-DEL: mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
; LOOP-DEL-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
-; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[WHILE_END:%.*]]
-; LOOP-DEL: mismatch_sve_loop_found:
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
-; LOOP-DEL-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL: mismatch_vec_loop_found:
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
-; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
; LOOP-DEL-NEXT: br label [[WHILE_END]]
; LOOP-DEL: mismatch_loop_pre:
@@ -176,7 +173,7 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) {
; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
; LOOP-DEL: while.end:
-; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; LOOP-DEL-NEXT: [[RES:%.*]] = add i32 [[MISMATCH_RESULT]], [[EXTRA]]
; LOOP-DEL-NEXT: ret i32 [[RES]]
;
@@ -256,36 +253,36 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
-; CHECK: mismatch_sve_loop_preheader:
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vec_loop_preheader:
; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
-; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
-; CHECK: mismatch_sve_loop:
-; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
-; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
-; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
+; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
+; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
-; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
-; CHECK: mismatch_sve_loop_inc:
-; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
; CHECK-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
-; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]]
-; CHECK: mismatch_sve_loop_found:
-; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
-; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
; CHECK-NEXT: br label [[MISMATCH_END]]
; CHECK: mismatch_loop_pre:
@@ -304,7 +301,7 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; CHECK: mismatch_end:
-; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; CHECK: while.cond:
; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
@@ -349,36 +346,36 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
-; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
-; LOOP-DEL: mismatch_sve_loop_preheader:
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_vec_loop_preheader:
; LOOP-DEL-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
-; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
-; LOOP-DEL: mismatch_sve_loop:
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
-; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
-; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
-; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
-; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; LOOP-DEL: mismatch_vec_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
-; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
-; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
-; LOOP-DEL: mismatch_sve_loop_inc:
-; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; LOOP-DEL: mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
; LOOP-DEL-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
-; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[WHILE_END:%.*]]
-; LOOP-DEL: mismatch_sve_loop_found:
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
-; LOOP-DEL-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL: mismatch_vec_loop_found:
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
-; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
; LOOP-DEL-NEXT: br label [[WHILE_END]]
; LOOP-DEL: mismatch_loop_pre:
@@ -397,7 +394,7 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
; LOOP-DEL: while.end:
-; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]]
;
; NO-TRANSFORM-LABEL: define i32 @compare_bytes_signed_wrap(
@@ -472,36 +469,36 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
-; CHECK: mismatch_sve_loop_preheader:
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vec_loop_preheader:
; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
-; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
-; CHECK: mismatch_sve_loop:
-; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
-; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
-; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
+; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
+; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
-; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
-; CHECK: mismatch_sve_loop_inc:
-; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
; CHECK-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
-; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]]
-; CHECK: mismatch_sve_loop_found:
-; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
-; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
; CHECK-NEXT: br label [[MISMATCH_END]]
; CHECK: mismatch_loop_pre:
@@ -520,7 +517,7 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3
; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; CHECK: mismatch_end:
-; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; CHECK: while.cond:
; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
@@ -576,36 +573,36 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3
; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
-; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
-; LOOP-DEL: mismatch_sve_loop_preheader:
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_vec_loop_preheader:
; LOOP-DEL-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
-; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
-; LOOP-DEL: mismatch_sve_loop:
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
-; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
-; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
-; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
-; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; LOOP-DEL: mismatch_vec_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
-; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
-; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
-; LOOP-DEL: mismatch_sve_loop_inc:
-; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; LOOP-DEL: mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
; LOOP-DEL-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
-; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[BYTE_COMPARE:%.*]]
-; LOOP-DEL: mismatch_sve_loop_found:
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
-; LOOP-DEL-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[BYTE_COMPARE:%.*]]
+; LOOP-DEL: mismatch_vec_loop_found:
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
-; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
; LOOP-DEL-NEXT: br label [[BYTE_COMPARE]]
; LOOP-DEL: mismatch_loop_pre:
@@ -624,7 +621,7 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3
; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[BYTE_COMPARE]], label [[MISMATCH_LOOP]]
; LOOP-DEL: byte.compare:
-; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; LOOP-DEL-NEXT: [[TMP45:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; LOOP-DEL-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP45]], i32 [[N]], i32 [[MISMATCH_RESULT]]
; LOOP-DEL-NEXT: [[SPEC_SELECT4:%.*]] = select i1 [[TMP45]], ptr [[D]], ptr [[C]]
@@ -729,36 +726,36 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
-; CHECK: mismatch_sve_loop_preheader:
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vec_loop_preheader:
; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
-; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
-; CHECK: mismatch_sve_loop:
-; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
-; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
-; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
+; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
+; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
-; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
-; CHECK: mismatch_sve_loop_inc:
-; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
; CHECK-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
-; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]]
-; CHECK: mismatch_sve_loop_found:
-; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
-; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
; CHECK-NEXT: br label [[MISMATCH_END]]
; CHECK: mismatch_loop_pre:
@@ -777,7 +774,7 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; CHECK: mismatch_end:
-; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; CHECK: while.cond:
; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
@@ -828,36 +825,36 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
-; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
-; LOOP-DEL: mismatch_sve_loop_preheader:
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_vec_loop_preheader:
; LOOP-DEL-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
-; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
-; LOOP-DEL: mismatch_sve_loop:
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
-; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
-; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
-; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
-; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; LOOP-DEL: mismatch_vec_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
-; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
-; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
-; LOOP-DEL: mismatch_sve_loop_inc:
-; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; LOOP-DEL: mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
; LOOP-DEL-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
-; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[WHILE_END]]
-; LOOP-DEL: mismatch_sve_loop_found:
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
-; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
-; LOOP-DEL-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_vec_loop_found:
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
-; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
; LOOP-DEL-NEXT: br label [[WHILE_END]]
; LOOP-DEL: mismatch_loop_pre:
@@ -876,7 +873,7 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
; LOOP-DEL: while.end:
-; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]]
;
; NO-TRANSFORM-LABEL: define i32 @compare_bytes_extra_cmp(
@@ -960,36 +957,36 @@ define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) {
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP8]], [[TMP9]]
; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
-; CHECK: mismatch_sve_loop_preheader:
+; CHECK-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vec_loop_preheader:
; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 1, i64 0)
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP17:%.*]] = mul nuw nsw i64 [[TMP16]], 16
-; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
-; CHECK: mismatch_sve_loop:
-; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP15]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP26:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP25:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_SVE_INDEX]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_SVE_INDEX]]
-; CHECK-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP20]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP15]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP26:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP25:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VEC_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VEC_INDEX]]
+; CHECK-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP20]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <vscale x 16 x i8> [[TMP19]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP22]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP22]], <vscale x 16 x i1> zeroinitializer
; CHECK-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP23]])
-; CHECK-NEXT: br i1 [[TMP24]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
-; CHECK: mismatch_sve_loop_inc:
-; CHECK-NEXT: [[TMP25]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP25]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP17]]
; CHECK-NEXT: [[TMP26]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP25]], i64 0)
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i64 0
-; CHECK-NEXT: br i1 [[TMP27]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]]
-; CHECK: mismatch_sve_loop_found:
-; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP23]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
-; CHECK-NEXT: [[TMP28:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; CHECK-NEXT: br i1 [[TMP27]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP23]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
+; CHECK-NEXT: [[TMP28:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP28]], i1 true)
; CHECK-NEXT: [[TMP30:%.*]] = zext i32 [[TMP29]] to i64
-; CHECK-NEXT: [[TMP31:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP30]]
+; CHECK-NEXT: [[TMP31:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP30]]
; CHECK-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
; CHECK-NEXT: br label [[MISMATCH_END]]
; CHECK: mismatch_loop_pre:
@@ -1008,7 +1005,7 @@ define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) {
; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i32 [[TMP39]], 0
; CHECK-NEXT: br i1 [[TMP40]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; CHECK: mismatch_end:
-; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP32]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP32]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; CHECK: while.cond:
; CHECK-NEXT: [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
>From 764aac0d12e0d66841d2807f3b6a6489009ca125 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 30 May 2024 11:10:07 -0700
Subject: [PATCH 2/2] [RISCV][LoopIdiom] Support VP intrinsics in
LoopIdiomTransform
Teach LoopIdiomTransform to use VP intrinsics to replace the byte
compare loops. Right now only RISC-V uses LoopIdiomTransform of this
style.
---
.../Transforms/Vectorize/LoopIdiomTransform.h | 17 +-
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 10 +
llvm/lib/Target/RISCV/RISCVTargetMachine.h | 2 +
.../Target/RISCV/RISCVTargetTransformInfo.h | 2 +
.../Vectorize/LoopIdiomTransform.cpp | 168 +-
.../LoopIdiom/RISCV/byte-compare-index.ll | 1751 +++++++++++++++++
6 files changed, 1938 insertions(+), 12 deletions(-)
create mode 100644 llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopIdiomTransform.h b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomTransform.h
index a97dcc7ae3a3f..866bf7e72e406 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopIdiomTransform.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomTransform.h
@@ -13,7 +13,22 @@
#include "llvm/Transforms/Scalar/LoopPassManager.h"
namespace llvm {
-struct LoopIdiomTransformPass : PassInfoMixin<LoopIdiomTransformPass> {
+enum class LoopIdiomTransformStyle { Masked, Predicated };
+
+class LoopIdiomTransformPass : public PassInfoMixin<LoopIdiomTransformPass> {
+ LoopIdiomTransformStyle VectorizeStyle = LoopIdiomTransformStyle::Masked;
+
+ // The VF used in vectorizing the byte compare pattern.
+ unsigned ByteCompareVF = 16;
+
+public:
+ LoopIdiomTransformPass() = default;
+ explicit LoopIdiomTransformPass(LoopIdiomTransformStyle S)
+ : VectorizeStyle(S) {}
+
+ LoopIdiomTransformPass(LoopIdiomTransformStyle S, unsigned BCVF)
+ : VectorizeStyle(S), ByteCompareVF(BCVF) {}
+
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR, LPMUpdater &U);
};
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index d9f8222669cab..f380a69b5e7e0 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -33,10 +33,12 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Vectorize/LoopIdiomTransform.h"
#include <optional>
using namespace llvm;
@@ -576,6 +578,14 @@ void RISCVPassConfig::addPostRegAlloc() {
addPass(createRISCVRedundantCopyEliminationPass());
}
+void RISCVTargetMachine::registerPassBuilderCallbacks(
+ PassBuilder &PB, bool PopulateClassToPassNames) {
+ PB.registerLateLoopOptimizationsEPCallback([=](LoopPassManager &LPM,
+ OptimizationLevel Level) {
+ LPM.addPass(LoopIdiomTransformPass(LoopIdiomTransformStyle::Predicated));
+ });
+}
+
yaml::MachineFunctionInfo *
RISCVTargetMachine::createDefaultFuncInfoYAML() const {
return new yaml::RISCVMachineFunctionInfo();
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
index 68dfb3c81f2fe..7111d5ec80e47 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
@@ -59,6 +59,8 @@ class RISCVTargetMachine : public LLVMTargetMachine {
PerFunctionMIParsingState &PFS,
SMDiagnostic &Error,
SMRange &SourceRange) const override;
+ void registerPassBuilderCallbacks(PassBuilder &PB,
+ bool PopulateClassToPassNames) override;
};
} // namespace llvm
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index a4d1390875095..073779e07b513 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -397,6 +397,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
bool shouldFoldTerminatingConditionAfterLSR() const {
return true;
}
+
+ std::optional<unsigned> getMinPageSize() const { return 4096; }
};
} // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomTransform.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomTransform.cpp
index 5af1d6aa3b61e..c034797a97fc3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomTransform.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomTransform.cpp
@@ -56,18 +56,34 @@ static cl::opt<bool> DisableAll("disable-loop-idiom-transform-all", cl::Hidden,
cl::init(false),
cl::desc("Disable Loop Idiom Transform Pass."));
+static cl::opt<LoopIdiomTransformStyle>
+ LITVecStyle("loop-idiom-transform-style", cl::Hidden,
+ cl::desc("The vectorization style for loop idiom transform."),
+ cl::values(clEnumValN(LoopIdiomTransformStyle::Masked, "masked",
+ "Use masked vector intrinsics"),
+ clEnumValN(LoopIdiomTransformStyle::Predicated,
+ "predicated", "Use VP intrinsics")),
+ cl::init(LoopIdiomTransformStyle::Masked));
+
static cl::opt<bool>
DisableByteCmp("disable-loop-idiom-transform-bytecmp", cl::Hidden,
cl::init(false),
cl::desc("Proceed with Loop Idiom Transform Pass, but do "
"not convert byte-compare loop(s)."));
+static cl::opt<unsigned>
+ ByteCmpVF("loop-idiom-transform-bytecmp-vf", cl::Hidden,
+ cl::desc("The vectorization factor for byte-compare patterns."),
+ cl::init(16));
+
static cl::opt<bool>
VerifyLoops("verify-loop-idiom-transform", cl::Hidden, cl::init(false),
cl::desc("Verify loops generated Loop Idiom Transform Pass."));
namespace {
class LoopIdiomTransform {
+ LoopIdiomTransformStyle VectorizeStyle;
+ unsigned ByteCompareVF;
Loop *CurLoop = nullptr;
DominatorTree *DT;
LoopInfo *LI;
@@ -82,10 +98,11 @@ class LoopIdiomTransform {
BasicBlock *VectorLoopIncBlock = nullptr;
public:
- explicit LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
- const TargetTransformInfo *TTI,
- const DataLayout *DL)
- : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+ LoopIdiomTransform(LoopIdiomTransformStyle S, unsigned VF, DominatorTree *DT,
+ LoopInfo *LI, const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+ : VectorizeStyle(S), ByteCompareVF(VF), DT(DT), LI(LI), TTI(TTI), DL(DL) {
+ }
bool run(Loop *L);
@@ -106,6 +123,10 @@ class LoopIdiomTransform {
Value *createMaskedFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
GetElementPtrInst *GEPB, Value *ExtStart,
Value *ExtEnd);
+ Value *createPredicatedFindMismatch(IRBuilder<> &Builder,
+ GetElementPtrInst *GEPA,
+ GetElementPtrInst *GEPB, Value *ExtStart,
+ Value *ExtEnd);
void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
PHINode *IndPhi, Value *MaxLen, Instruction *Index,
@@ -123,7 +144,15 @@ PreservedAnalyses LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM,
const auto *DL = &L.getHeader()->getModule()->getDataLayout();
- LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL);
+ LoopIdiomTransformStyle VecStyle = VectorizeStyle;
+ if (LITVecStyle.getNumOccurrences())
+ VecStyle = LITVecStyle;
+
+ unsigned BCVF = ByteCompareVF;
+ if (ByteCmpVF.getNumOccurrences())
+ BCVF = ByteCmpVF;
+
+ LoopIdiomTransform LIT(VecStyle, BCVF, &AR.DT, &AR.LI, &AR.TTI, DL);
if (!LIT.run(&L))
return PreservedAnalyses::all();
@@ -357,14 +386,15 @@ Value *LoopIdiomTransform::createMaskedFindMismatch(IRBuilder<> &Builder,
// Therefore, we know that we can use a 64-bit induction variable that
// starts from 0 -> ExtMaxLen and it will not overflow.
ScalableVectorType *PredVTy =
- ScalableVectorType::get(Builder.getInt1Ty(), 16);
+ ScalableVectorType::get(Builder.getInt1Ty(), ByteCompareVF);
Value *InitialPred = Builder.CreateIntrinsic(
Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
- VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "",
- /*HasNUW=*/true, /*HasNSW=*/true);
+ VecLen =
+ Builder.CreateMul(VecLen, ConstantInt::get(I64Type, ByteCompareVF), "",
+ /*HasNUW=*/true, /*HasNSW=*/true);
Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
Builder.getInt1(false));
@@ -379,7 +409,8 @@ Value *LoopIdiomTransform::createMaskedFindMismatch(IRBuilder<> &Builder,
LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
- Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16);
+ Type *VectorLoadType =
+ ScalableVectorType::get(Builder.getInt8Ty(), ByteCompareVF);
Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
Value *VectorLhsGep = Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi);
@@ -442,6 +473,112 @@ Value *LoopIdiomTransform::createMaskedFindMismatch(IRBuilder<> &Builder,
return Builder.CreateTrunc(VectorLoopRes64, ResType);
}
+Value *LoopIdiomTransform::createPredicatedFindMismatch(IRBuilder<> &Builder,
+ GetElementPtrInst *GEPA,
+ GetElementPtrInst *GEPB,
+ Value *ExtStart,
+ Value *ExtEnd) {
+ Type *I64Type = Builder.getInt64Ty();
+ Type *I32Type = Builder.getInt32Ty();
+ Type *ResType = I32Type;
+ Type *LoadType = Builder.getInt8Ty();
+ Value *PtrA = GEPA->getPointerOperand();
+ Value *PtrB = GEPB->getPointerOperand();
+
+ // At this point we know two things must be true:
+ // 1. Start <= End
+ // 2. ExtMaxLen <= 4096 due to the page checks.
+ // Therefore, we know that we can use a 64-bit induction variable that
+ // starts from 0 -> ExtMaxLen and it will not overflow.
+ auto *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
+ Builder.Insert(JumpToVectorLoop);
+
+ // Set up the first Vector loop block by creating the PHIs, doing the vector
+ // loads and comparing the vectors.
+ Builder.SetInsertPoint(VectorLoopStartBlock);
+ auto *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vector_index");
+ VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
+
+ // Calculate AVL by subtracting the vector loop index from the trip count
+ Value *AVL = Builder.CreateSub(ExtEnd, VectorIndexPhi, "avl", /*HasNUW=*/true,
+ /*HasNSW=*/true);
+
+ auto *VectorLoadType = ScalableVectorType::get(LoadType, ByteCompareVF);
+ auto *VF = ConstantInt::get(
+ I32Type, VectorLoadType->getElementCount().getKnownMinValue());
+ auto *IsScalable = ConstantInt::getBool(
+ Builder.getContext(), VectorLoadType->getElementCount().isScalable());
+
+ Value *VL = Builder.CreateIntrinsic(Intrinsic::experimental_get_vector_length,
+ {I64Type}, {AVL, VF, IsScalable});
+ Value *GepOffset = VectorIndexPhi;
+
+ Value *VectorLhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset);
+ if (GEPA->isInBounds())
+ cast<GetElementPtrInst>(VectorLhsGep)->setIsInBounds(true);
+ VectorType *TrueMaskTy =
+ VectorType::get(Builder.getInt1Ty(), VectorLoadType->getElementCount());
+ Value *AllTrueMask = Constant::getAllOnesValue(TrueMaskTy);
+ Value *VectorLhsLoad = Builder.CreateIntrinsic(
+ Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
+ {VectorLhsGep, AllTrueMask, VL}, nullptr, "lhs.load");
+
+ Value *VectorRhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset);
+ if (GEPB->isInBounds())
+ cast<GetElementPtrInst>(VectorRhsGep)->setIsInBounds(true);
+ Value *VectorRhsLoad = Builder.CreateIntrinsic(
+ Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
+ {VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load");
+
+ StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE);
+ auto *PredicateMDS = MDString::get(VectorLhsLoad->getContext(), PredicateStr);
+ Value *Pred = MetadataAsValue::get(VectorLhsLoad->getContext(), PredicateMDS);
+ Value *VectorMatchCmp = Builder.CreateIntrinsic(
+ Intrinsic::vp_icmp, {VectorLhsLoad->getType()},
+ {VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr,
+ "mismatch.cmp");
+ Value *CTZ = Builder.CreateIntrinsic(
+ Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()},
+ {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true), AllTrueMask,
+ VL});
+ // RISC-V refines/lowers the poison returned by vp.cttz.elts to -1.
+ Value *MismatchFound =
+ Builder.CreateICmpSGE(CTZ, ConstantInt::get(ResType, 0));
+ auto *VectorEarlyExit = BranchInst::Create(VectorLoopMismatchBlock,
+ VectorLoopIncBlock, MismatchFound);
+ Builder.Insert(VectorEarlyExit);
+
+ // Increment the index counter and calculate the predicate for the next
+ // iteration of the loop. We branch back to the start of the loop if there
+ // is at least one active lane.
+ Builder.SetInsertPoint(VectorLoopIncBlock);
+ Value *VL64 = Builder.CreateZExt(VL, I64Type);
+ Value *NewVectorIndexPhi =
+ Builder.CreateAdd(VectorIndexPhi, VL64, "",
+ /*HasNUW=*/true, /*HasNSW=*/true);
+ VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
+ Value *ExitCond = Builder.CreateICmpNE(NewVectorIndexPhi, ExtEnd);
+ auto *VectorLoopBranchBack =
+ BranchInst::Create(VectorLoopStartBlock, EndBlock, ExitCond);
+ Builder.Insert(VectorLoopBranchBack);
+
+ // If we found a mismatch then we need to calculate which lane in the vector
+ // had a mismatch and add that on to the current loop index.
+ Builder.SetInsertPoint(VectorLoopMismatchBlock);
+
+ // Add LCSSA phis for CTZ and VectorIndexPhi.
+ auto *CTZLCSSAPhi = Builder.CreatePHI(CTZ->getType(), 1, "ctz");
+ CTZLCSSAPhi->addIncoming(CTZ, VectorLoopStartBlock);
+ auto *VectorIndexLCSSAPhi =
+ Builder.CreatePHI(VectorIndexPhi->getType(), 1, "mismatch_vector_index");
+ VectorIndexLCSSAPhi->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
+
+ Value *CTZI64 = Builder.CreateZExt(CTZLCSSAPhi, I64Type);
+ Value *VectorLoopRes64 = Builder.CreateAdd(VectorIndexLCSSAPhi, CTZI64, "",
+ /*HasNUW=*/true, /*HasNSW=*/true);
+ return Builder.CreateTrunc(VectorLoopRes64, ResType);
+}
+
Value *LoopIdiomTransform::expandFindMismatch(
IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) {
@@ -593,8 +730,17 @@ Value *LoopIdiomTransform::expandFindMismatch(
// processed in each iteration, etc.
Builder.SetInsertPoint(VectorLoopPreheaderBlock);
- Value *VectorLoopRes =
- createMaskedFindMismatch(Builder, GEPA, GEPB, ExtStart, ExtEnd);
+ Value *VectorLoopRes = nullptr;
+ switch (VectorizeStyle) {
+ case LoopIdiomTransformStyle::Masked:
+ VectorLoopRes =
+ createMaskedFindMismatch(Builder, GEPA, GEPB, ExtStart, ExtEnd);
+ break;
+ case LoopIdiomTransformStyle::Predicated:
+ VectorLoopRes =
+ createPredicatedFindMismatch(Builder, GEPA, GEPB, ExtStart, ExtEnd);
+ break;
+ }
Builder.Insert(BranchInst::Create(EndBlock));
diff --git a/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
new file mode 100644
index 0000000000000..11bc4d86d1881
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
@@ -0,0 +1,1751 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=loop-idiom-transform -mtriple=riscv64-unknown-linux-gnu -loop-idiom-transform-style=predicated -mattr=+v -S < %s | FileCheck %s
+; RUN: opt -passes=loop-idiom-transform -mtriple=riscv64-unknown-linux-gnu -loop-idiom-transform-style=predicated -loop-idiom-transform-bytecmp-vf=64 -mattr=+v -S < %s | FileCheck %s --check-prefix=LMUL8
+; RUN: opt -passes='loop(loop-idiom-transform),simplifycfg' -mtriple=riscv64-unknown-linux-gnu -loop-idiom-transform-style=predicated -mattr=+v -S < %s | FileCheck %s --check-prefix=LOOP-DEL
+
+define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: mismatch_vec_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: byte.compare:
+; CHECK-NEXT: br label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; LMUL8: mismatch_vec_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vec_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vec_loop_inc:
+; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vec_loop_found:
+; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: br label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1:![0-9]+]]
+; LOOP-DEL: mismatch_vec_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL: mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL: mismatch_vec_loop_found:
+; LOOP-DEL-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT: br label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]]
+;
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+ ret i32 %inc.lcssa
+}
+
+define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_signed_wrap(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vec_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: byte.compare:
+; CHECK-NEXT: br label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_signed_wrap(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8: mismatch_vec_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vec_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vec_loop_inc:
+; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vec_loop_found:
+; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: br label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_signed_wrap(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_vec_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL: mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL: mismatch_vec_loop_found:
+; LOOP-DEL-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT: br label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_signed_wrap(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add nsw i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; NO-TRANSFORM: while.end:
+; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]]
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add nsw i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+ ret i32 %inc.lcssa
+}
+
+
+define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vec_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; CHECK: while.found:
+; CHECK-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: br label [[END:%.*]]
+; CHECK: byte.compare:
+; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]]
+; CHECK: while.end:
+; CHECK-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; CHECK-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; CHECK-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; CHECK-NEXT: ret i32 [[MISMATCH_INDEX]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8: mismatch_vec_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vec_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vec_loop_inc:
+; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vec_loop_found:
+; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; LMUL8: while.found:
+; LMUL8-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: br label [[END:%.*]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: br label [[END]]
+; LMUL8: end:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; LMUL8-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; LMUL8-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; LMUL8-NEXT: ret i32 [[MISMATCH_INDEX]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_vec_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL: mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[BYTE_COMPARE:%.*]]
+; LOOP-DEL: mismatch_vec_loop_found:
+; LOOP-DEL-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT: br label [[BYTE_COMPARE]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[BYTE_COMPARE]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[BYTE_COMPARE]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: byte.compare:
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: [[TMP37:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LOOP-DEL-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP37]], i32 [[N]], i32 [[MISMATCH_RESULT]]
+; LOOP-DEL-NEXT: [[SPEC_SELECT4:%.*]] = select i1 [[TMP37]], ptr [[D]], ptr [[C]]
+; LOOP-DEL-NEXT: store i32 [[SPEC_SELECT]], ptr [[SPEC_SELECT4]], align 4
+; LOOP-DEL-NEXT: ret i32 [[SPEC_SELECT]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_simple_end_ne_found(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; NO-TRANSFORM: while.found:
+; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT: br label [[END:%.*]]
+; NO-TRANSFORM: while.end:
+; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT: br label [[END]]
+; NO-TRANSFORM: end:
+; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; NO-TRANSFORM-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; NO-TRANSFORM-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; NO-TRANSFORM-NEXT: ret i32 [[MISMATCH_INDEX]]
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.found
+
+while.found:
+ %mismatch_index1 = phi i32 [ %inc, %while.body ]
+ %found_ptr = phi ptr [ %c, %while.body ]
+ br label %end
+
+while.end:
+ %mismatch_index2 = phi i32 [ %n, %while.cond ]
+ %end_ptr = phi ptr [ %d, %while.cond ]
+ br label %end
+
+end:
+ %mismatch_index = phi i32 [ %mismatch_index1, %while.found ], [ %mismatch_index2, %while.end ]
+ %store_ptr = phi ptr [ %end_ptr, %while.end ], [ %found_ptr, %while.found ]
+ store i32 %mismatch_index, ptr %store_ptr
+ ret i32 %mismatch_index
+}
+
+
+
+define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
+; CHECK-LABEL: define i32 @compare_bytes_extra_cmp(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; CHECK-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; CHECK: ph:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vec_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
+; CHECK: byte.compare:
+; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT]]
+; CHECK: while.end.loopexit:
+; CHECK-NEXT: [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: br label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_extra_cmp(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; LMUL8-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; LMUL8: ph:
+; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LMUL8-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8: mismatch_vec_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vec_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
+; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vec_loop_inc:
+; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vec_loop_found:
+; LMUL8-NEXT: [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: br label [[WHILE_END_LOOPEXIT]]
+; LMUL8: while.end.loopexit:
+; LMUL8-NEXT: [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: br label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_extra_cmp(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; LOOP-DEL-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; LOOP-DEL: ph:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_vec_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
+; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0
+; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LOOP-DEL: mismatch_vec_loop_inc:
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
+; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_vec_loop_found:
+; LOOP-DEL-NEXT: [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; LOOP-DEL-NEXT: br label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
+; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]]
+;
+; NO-TRANSFORM-LABEL: define i32 @compare_bytes_extra_cmp(
+; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) {
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; NO-TRANSFORM: ph:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[PH]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; NO-TRANSFORM: while.end:
+; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ], [ [[X]], [[ENTRY:%.*]] ]
+; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]]
+entry:
+ %cmp.x = icmp ult i32 %n, %x
+ br i1 %cmp.x, label %ph, label %while.end
+
+ph:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %ph ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ], [ %x, %entry ]
+ ret i32 %inc.lcssa
+}
+
+define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) {
+; CHECK-LABEL: define void @compare_bytes_cleanup_block(
+; CHECK-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
+; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP10]], 12
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP4]], 12
+; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP9]], 12
+; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_vec_loop_preheader:
+; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; CHECK: mismatch_vec_loop:
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 0, [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP16]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
+; CHECK-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; CHECK: mismatch_vec_loop_inc:
+; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
+; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0
+; CHECK-NEXT: br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_vec_loop_found:
+; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]]
+; CHECK-NEXT: br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
+; CHECK-NEXT: br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; CHECK: byte.compare:
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; CHECK-NEXT: br i1 [[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]]
+; CHECK: cleanup.thread:
+; CHECK-NEXT: ret void
+; CHECK: if.end:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: ret void
+;
+; LMUL8-LABEL: define void @compare_bytes_cleanup_block(
+; LMUL8-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; LMUL8: mismatch_min_it_check:
+; LMUL8-NEXT: br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LMUL8: mismatch_mem_check:
+; LMUL8-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
+; LMUL8-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
+; LMUL8-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LMUL8-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; LMUL8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
+; LMUL8-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
+; LMUL8-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; LMUL8-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
+; LMUL8-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP10]], 12
+; LMUL8-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP4]], 12
+; LMUL8-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP9]], 12
+; LMUL8-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP7]], 12
+; LMUL8-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]]
+; LMUL8-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]]
+; LMUL8-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; LMUL8-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LMUL8: mismatch_vec_loop_preheader:
+; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]]
+; LMUL8: mismatch_vec_loop:
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 0, [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
+; LMUL8-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP16]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
+; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP17]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 true, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
+; LMUL8-NEXT: [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0
+; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
+; LMUL8: mismatch_vec_loop_inc:
+; LMUL8-NEXT: [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
+; LMUL8-NEXT: [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]]
+; LMUL8-NEXT: [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0
+; LMUL8-NEXT: br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
+; LMUL8: mismatch_vec_loop_found:
+; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
+; LMUL8-NEXT: [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64
+; LMUL8-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]]
+; LMUL8-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+; LMUL8-NEXT: br label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_pre:
+; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LMUL8: mismatch_loop:
+; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LMUL8-NEXT: [[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LMUL8-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]]
+; LMUL8-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
+; LMUL8-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]]
+; LMUL8-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; LMUL8-NEXT: [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]]
+; LMUL8-NEXT: br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; LMUL8: mismatch_loop_inc:
+; LMUL8-NEXT: [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1
+; LMUL8-NEXT: [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
+; LMUL8-NEXT: br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; LMUL8: mismatch_end:
+; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
+; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
+; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; LMUL8: byte.compare:
+; LMUL8-NEXT: [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; LMUL8-NEXT: br i1 [[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]]
+; LMUL8: cleanup.thread:
+; LMUL8-NEXT: ret void
+; LMUL8: if.end:
+; LMUL8-NEXT: [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; LMUL8-NEXT: ret void
+;
+; LOOP-DEL-LABEL: define void @compare_bytes_cleanup_block(
+; LOOP-DEL-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[MISMATCH_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP0]]
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP0]]
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP2]], [[TMP4]]
+; LOOP-DEL-NEXT: [[TMP6]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
+; LOOP-DEL-NEXT: [[OR_COND:%.*]] = or i1 [[TMP5]], [[TMP7]]
+; LOOP-DEL-NEXT: br i1 [[OR_COND]], label [[COMMON_RET:%.*]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: common.ret:
+; LOOP-DEL-NEXT: ret void
+;
+; NO-TRANSFORM-LABEL: define void @compare_bytes_cleanup_block(
+; NO-TRANSFORM-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], 0
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; NO-TRANSFORM: cleanup.thread:
+; NO-TRANSFORM-NEXT: ret void
+; NO-TRANSFORM: if.end:
+; NO-TRANSFORM-NEXT: [[RES:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT: ret void
+entry:
+ br label %while.cond
+
+while.cond:
+ %len = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %inc = add i32 %len, 1
+ %cmp.not = icmp eq i32 %inc, 0
+ br i1 %cmp.not, label %cleanup.thread, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr i8, ptr %src1, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %arrayidx2 = getelementptr i8, ptr %src2, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2, align 1
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %if.end
+
+cleanup.thread:
+ ret void
+
+if.end:
+ %res = phi i32 [ %inc, %while.body ]
+ ret void
+}
+
+;
+; NEGATIVE TESTS
+;
+
+; Similar to @compare_bytes_simple, except in the while.end block we have an extra PHI
+; with unique values for each incoming block from the loop.
+define i32 @compare_bytes_simple2(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple2(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; CHECK-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple2(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: br label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; LMUL8-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple2(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]]
+; LOOP-DEL: while.cond:
+; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL: while.body:
+; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
+; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]]
+;
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+ %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ]
+ store i32 %inc.lcssa, ptr %final_ptr
+ ret i32 %inc.lcssa
+}
+
+define i32 @compare_bytes_simple3(ptr %a, ptr %b, ptr %c, i32 %d, i32 %len, i32 %n) {
+; CHECK-LABEL: define i32 @compare_bytes_simple3(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; CHECK-NEXT: ret i32 [[FINAL_VAL]]
+;
+; LMUL8-LABEL: define i32 @compare_bytes_simple3(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: br label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; LMUL8-NEXT: ret i32 [[FINAL_VAL]]
+;
+; LOOP-DEL-LABEL: define i32 @compare_bytes_simple3(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]]
+; LOOP-DEL: while.cond:
+; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL: while.body:
+; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4
+; LOOP-DEL-NEXT: ret i32 [[FINAL_VAL]]
+;
+ entry:
+ br label %while.cond
+
+ while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+ while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+ while.end:
+ %final_val = phi i32 [ %d, %while.body ], [ %inc, %while.cond ]
+ store i32 %final_val, ptr %c
+ ret i32 %final_val
+}
+
+; Disable the optimization when noimplicitfloat is present.
+define i32 @no_implicit_float(ptr %a, ptr %b, i32 %len, i32 %n) noimplicitfloat {
+; CHECK-LABEL: define i32 @no_implicit_float(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LMUL8-LABEL: define i32 @no_implicit_float(
+; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; LMUL8-NEXT: entry:
+; LMUL8-NEXT: br label [[WHILE_COND:%.*]]
+; LMUL8: while.cond:
+; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LMUL8-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LMUL8: while.body:
+; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LMUL8-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LMUL8: while.end:
+; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LMUL8-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: define i32 @no_implicit_float(
+; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]]
+; LOOP-DEL: while.cond:
+; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; LOOP-DEL: while.body:
+; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]]
+;
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+ ret i32 %inc.lcssa
+}
More information about the llvm-commits
mailing list