[llvm] [AArch64][SVE] Use loop.dependence.war.mask in vector.memcheck (PR #175943)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 11 06:02:02 PST 2026
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/175943
>From 7e01bdd180c9f439bc8343481dd27ae661e07ef5 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 12 Jan 2026 14:55:44 +0000
Subject: [PATCH] [AArch64][SVE] Use `loop.dependence.war.mask` in
vector.memcheck
This patch updates `addDiffRuntimeChecks()` to use the last element
of a `loop.dependence.war.mask` for the check when it is "cheap". This
is currently determined by the cost model returning a cost <= 1.
Using the last element of a `loop.dependence.war.mask` has semantics
similar to the current diff checks, but additionally allows the legal
`Diff == 0` case.
The current checks are equivalent to:
```
if (Diff < 0 || Diff >= AccessSize * (VF * IC))
// vector loop
```
Whereas using `loop.dependence.war.mask` is equivalent to:
```
if (Diff <= 0 || Diff > AccessSize * (VF * IC - 1))
// vector loop
```
Note: `Diff >= AccessSize * (VF * IC)` is the same as
`Diff > AccessSize * (VF * IC - 1)` when both accesses are aligned to
`AccessSize` (checked via the new `AccessAlign` field), since `Diff` is
then a multiple of `AccessSize`.
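As a purely illustrative example, take `AccessSize = 4` and
`VF * IC = 4`, giving bounds of 16 and 12 respectively:
```
if (Diff < 0 || Diff >= 16)  // current check
if (Diff <= 0 || Diff > 12)  // loop.dependence.war.mask check
```
The two only disagree for `Diff == 0` (now legal) and `Diff` in 13..15,
which cannot occur when both accesses are 4-byte aligned.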
On AArch64 with SVE2 this allows the diff checks to be lowered to:
```
whilewr p0.s, x1, x2
b.nlast .Lscalar_loop
```
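At the IR level the emitted check is roughly the following (abridged
from the `alias_mask` test added below; value names are simplified from
the autogenerated checks, and `%diff` is the sink-minus-source
distance):
```
%diff.ptr    = inttoptr i64 %diff to ptr
%dep.mask    = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr null, ptr %diff.ptr, i64 1)
; %runtime.vf = vscale * 16, computed via @llvm.vscale.i64
%last.lane   = sub i64 %runtime.vf, 1
%no.conflict = extractelement <vscale x 16 x i1> %dep.mask, i64 %last.lane
%is.conflict = xor i1 %no.conflict, true
br i1 %is.conflict, label %scalar.ph, label %vector.ph
```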
Note: This is similar to https://reviews.llvm.org/D138652
---
.../llvm/Analysis/LoopAccessAnalysis.h | 5 +-
.../llvm/Analysis/TargetTransformInfoImpl.h | 3 +
.../include/llvm/Transforms/Utils/LoopUtils.h | 3 +-
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 7 +-
llvm/lib/Transforms/Utils/LoopUtils.cpp | 64 +++-
.../Transforms/Vectorize/LoopVectorize.cpp | 31 +-
.../CostModel/loop-dep-mask-no_info.ll | 47 +++
.../LoopVectorize/AArch64/alias_mask.ll | 347 ++++++++++++++++++
.../AArch64/conditional-branches-cost.ll | 4 +-
.../AArch64/induction-costs-sve.ll | 4 +-
.../AArch64/scalable-struct-return.ll | 6 +-
.../LoopVectorize/AArch64/sve-fneg.ll | 2 +-
.../LoopVectorize/AArch64/sve-multiexit.ll | 4 +-
.../sve-runtime-check-size-based-threshold.ll | 10 +-
.../AArch64/sve-vector-reverse.ll | 2 +-
.../LoopVectorize/AArch64/sve2-histcnt.ll | 27 +-
.../LoopVectorize/RISCV/fminimumnum.ll | 48 +--
.../RISCV/riscv-vector-reverse.ll | 12 +-
.../LoopVectorize/RISCV/strided-accesses.ll | 4 +-
.../RISCV/tail-folding-bin-unary-ops-args.ll | 72 ++--
.../RISCV/tail-folding-call-intrinsics.ll | 52 +--
.../RISCV/tail-folding-cast-intrinsics.ll | 20 +-
...ntime-checks-difference-simplifications.ll | 233 ++++++++++++
.../scalable-first-order-recurrence.ll | 8 +-
24 files changed, 850 insertions(+), 165 deletions(-)
create mode 100644 llvm/test/Analysis/CostModel/loop-dep-mask-no_info.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index c85ef3e131068..92ea3b53d59b4 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -491,11 +491,12 @@ struct PointerDiffInfo {
const SCEV *SinkStart;
unsigned AccessSize;
bool NeedsFreeze;
+ Align AccessAlign;
PointerDiffInfo(const SCEV *SrcStart, const SCEV *SinkStart,
- unsigned AccessSize, bool NeedsFreeze)
+ unsigned AccessSize, bool NeedsFreeze, Align AccessAlign)
: SrcStart(SrcStart), SinkStart(SinkStart), AccessSize(AccessSize),
- NeedsFreeze(NeedsFreeze) {}
+ NeedsFreeze(NeedsFreeze), AccessAlign(AccessAlign) {}
};
/// Holds information about the memory runtime legality checks to verify
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 5ef18fecabd99..0983850341a8c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -905,6 +905,9 @@ class TargetTransformInfoImplBase {
switch (ICA.getID()) {
default:
break;
+ case Intrinsic::loop_dependence_raw_mask:
+ case Intrinsic::loop_dependence_war_mask:
+ return 10;
case Intrinsic::allow_runtime_check:
case Intrinsic::allow_ubsan_check:
case Intrinsic::annotation:
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 1e48eeca72952..2f3317e533e12 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -634,7 +634,8 @@ addRuntimeChecks(Instruction *Loc, Loop *TheLoop,
LLVM_ABI Value *addDiffRuntimeChecks(
Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
- function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC);
+ ElementCount VF, unsigned IC,
+ function_ref<bool(unsigned)> UsesLoopDependenceMaskForAccessSize);
/// Struct to hold information about a partially invariant condition.
struct IVConditionInfo {
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index eae645ab84fff..f7535c44cf315 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -508,11 +508,16 @@ bool RuntimePointerChecking::tryToCreateDiffCheck(
}
}
+ // Find the minimum common alignment of all accesses.
+ Align AccessAlign = getLoadStoreAlignment(SrcInsts[0]);
+ for (Instruction *Inst : concat<Instruction *>(SrcInsts, SinkInsts))
+ AccessAlign = std::min(AccessAlign, getLoadStoreAlignment(Inst));
+
LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
<< "SrcStart: " << *SrcStartInt << '\n'
<< "SinkStartInt: " << *SinkStartInt << '\n');
DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
- Src->NeedsFreeze || Sink->NeedsFreeze);
+ Src->NeedsFreeze || Sink->NeedsFreeze, AccessAlign);
return true;
}
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 2d7cb2a035957..c256085e4effd 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -2193,40 +2193,72 @@ struct SCEVPtrToAddrRewriter : SCEVRewriteVisitor<SCEVPtrToAddrRewriter> {
Value *llvm::addDiffRuntimeChecks(
Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
- function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC) {
-
+ ElementCount VF, unsigned IC,
+ function_ref<bool(unsigned)> UsesLoopDependenceMaskForAccessSize) {
LLVMContext &Ctx = Loc->getContext();
IRBuilder ChkBuilder(Ctx, InstSimplifyFolder(Loc->getDataLayout()));
ChkBuilder.SetInsertPoint(Loc);
+ Value *RuntimeVF = nullptr;
// Our instructions might fold to a constant.
Value *MemoryRuntimeCheck = nullptr;
-
auto &SE = *Expander.getSE();
const DataLayout &DL = Loc->getDataLayout();
SCEVPtrToAddrRewriter Rewriter(SE, DL);
// Map to keep track of created compares, The key is the pair of operands for
// the compare, to allow detecting and re-using redundant compares.
DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
- for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze] : Checks) {
+ for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze, AccessAlign] :
+ Checks) {
+ Value *IsConflict;
Type *Ty = SinkStart->getType();
- // Compute VF * IC * AccessSize.
- auto *VFTimesICTimesSize =
- ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()),
- ConstantInt::get(Ty, IC * AccessSize));
+ Type *CheckTy = ChkBuilder.getIntNTy(Ty->getScalarSizeInBits());
const SCEV *SinkStartRewritten = Rewriter.visit(SinkStart);
const SCEV *SrcStartRewritten = Rewriter.visit(SrcStart);
Value *Diff = Expander.expandCodeFor(
SE.getMinusSCEV(SinkStartRewritten, SrcStartRewritten), Ty, Loc);
- // Check if the same compare has already been created earlier. In that case,
- // there is no need to check it again.
- Value *IsConflict = SeenCompares.lookup({Diff, VFTimesICTimesSize});
- if (IsConflict)
- continue;
+ VectorType *MaskTy = VectorType::get(ChkBuilder.getInt1Ty(), VF * IC);
+ if (!UsesLoopDependenceMaskForAccessSize(AccessSize) ||
+ commonAlignment(AccessAlign, AccessSize) < AccessSize) {
+ // Compute VF * IC * AccessSize.
+ if (!RuntimeVF)
+ RuntimeVF = ChkBuilder.CreateElementCount(CheckTy, VF);
+
+ auto *VFTimesICTimesSize = ChkBuilder.CreateMul(
+ RuntimeVF, ConstantInt::get(Ty, IC * AccessSize));
+ // Check if the same compare has already been created earlier. In that
+ // case, there is no need to check it again.
+ IsConflict = SeenCompares.lookup({Diff, VFTimesICTimesSize});
+ if (IsConflict)
+ continue;
- IsConflict =
- ChkBuilder.CreateICmpULT(Diff, VFTimesICTimesSize, "diff.check");
- SeenCompares.insert({{Diff, VFTimesICTimesSize}, IsConflict});
+ IsConflict =
+ ChkBuilder.CreateICmpULT(Diff, VFTimesICTimesSize, "diff.check");
+ SeenCompares.insert({{Diff, VFTimesICTimesSize}, IsConflict});
+ } else {
+ Value *LoopAccessSize = ChkBuilder.getInt64(AccessSize);
+ IsConflict = SeenCompares.lookup({Diff, LoopAccessSize});
+ if (IsConflict)
+ continue;
+
+ // Note: This creates loop.dependence.war.mask(ptr null, ptr %diff). This
+ // allows SCEV to remove common offsets and avoids creating duplicate
+ // checks. If %diff is a sub, it can be folded into the mask.
+ Value *SrcPtr = ConstantPointerNull::get(PointerType::getUnqual(Ctx));
+ Value *SinkPtr = ChkBuilder.CreateIntToPtr(Diff, ChkBuilder.getPtrTy());
+ Value *Mask = ChkBuilder.CreateIntrinsic(
+ MaskTy, Intrinsic::loop_dependence_war_mask,
+ {SrcPtr, SinkPtr, LoopAccessSize}, {}, "loop.dep.mask");
+
+ Value *LastLaneIdx = ChkBuilder.CreateSub(
+ ChkBuilder.CreateElementCount(CheckTy, MaskTy->getElementCount()),
+ ChkBuilder.getIntN(Ty->getScalarSizeInBits(), 1));
+ Value *NoConflict =
+ ChkBuilder.CreateExtractElement(Mask, LastLaneIdx, "no.conflict");
+
+ IsConflict = ChkBuilder.CreateNot(NoConflict, "is.conflict");
+ SeenCompares.insert({{Diff, LoopAccessSize}, IsConflict});
+ }
if (NeedsFreeze)
IsConflict =
ChkBuilder.CreateFreeze(IsConflict, IsConflict->getName() + ".fr");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 164d74f1a440d..36d979babf649 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1887,15 +1887,13 @@ class GeneratedRTChecks {
auto DiffChecks = RtPtrChecking.getDiffChecks();
if (DiffChecks) {
- Value *RuntimeVF = nullptr;
- MemRuntimeCheckCond = addDiffRuntimeChecks(
- MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
- [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
- if (!RuntimeVF)
- RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
- return RuntimeVF;
- },
- IC);
+ LLVMContext &Ctx = MemCheckBlock->getContext();
+ auto UseLoopDependenceMask = [&](unsigned AccessSize) {
+ return isLoopDependenceMaskCheap(Ctx, VF, IC, AccessSize);
+ };
+ MemRuntimeCheckCond =
+ addDiffRuntimeChecks(MemCheckBlock->getTerminator(), *DiffChecks,
+ MemCheckExp, VF, IC, UseLoopDependenceMask);
} else {
MemRuntimeCheckCond = addRuntimeChecks(
MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
@@ -1947,6 +1945,21 @@ class GeneratedRTChecks {
OuterLoop = L->getParentLoop();
}
+ bool isLoopDependenceMaskCheap(LLVMContext &Ctx, ElementCount VF, unsigned IC,
+ unsigned AccessSize) {
+ if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+ return ForceTargetInstructionCost <= 1;
+ VectorType *MaskTy = VectorType::get(Type::getInt1Ty(Ctx), VF * IC);
+ Value *AccessSizeVal = ConstantInt::get(Type::getInt64Ty(Ctx), AccessSize);
+ Value *NullPtr = ConstantPointerNull::get(PointerType::getUnqual(Ctx));
+ // The pointer values should not change the cost. The access size (constant)
+ // is needed by targets to cost the mask.
+ IntrinsicCostAttributes ICA(Intrinsic::loop_dependence_war_mask, MaskTy,
+ {NullPtr, NullPtr, AccessSizeVal});
+ InstructionCost Cost = TTI->getIntrinsicInstrCost(ICA, CostKind);
+ return Cost.isValid() && Cost <= 1;
+ }
+
InstructionCost getCost() {
if (SCEVCheckBlock || MemCheckBlock)
LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
diff --git a/llvm/test/Analysis/CostModel/loop-dep-mask-no_info.ll b/llvm/test/Analysis/CostModel/loop-dep-mask-no_info.ll
new file mode 100644
index 0000000000000..7b326c7b4a805
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/loop-dep-mask-no_info.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size %s -S -o - | FileCheck %s --check-prefix=CHECK-SIZE
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput %s -S -o - | FileCheck %s --check-prefix=CHECK-THROUGHPUT
+
+define void @loop_dependence_war_mask(ptr %a, ptr %b) {
+; CHECK-SIZE-LABEL: 'loop_dependence_war_mask'
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; CHECK-THROUGHPUT-LABEL: 'loop_dependence_war_mask'
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
+ %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
+ %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
+ %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
+ ret void
+}
+
+define void @loop_dependence_raw_mask(ptr %a, ptr %b) {
+; CHECK-SIZE-LABEL: 'loop_dependence_raw_mask'
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; CHECK-THROUGHPUT-LABEL: 'loop_dependence_raw_mask'
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
+ %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
+ %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
+ %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
new file mode 100644
index 0000000000000..f3b47a6121d61
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -0,0 +1,347 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^middle.block:" --filter-out-after "^scalar.ph:" --version 4
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
+
+define void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define void @alias_mask(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B2:%.*]] = ptrtoaddr ptr [[B]] to i64
+; CHECK-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64
+; CHECK-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[C1]], [[B2]]
+; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; CHECK-NEXT: [[LOOP_DEP_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr null, ptr [[TMP1]], i64 1)
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP5]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP19]], 1
+; CHECK-NEXT: [[NO_CONFLICT:%.*]] = extractelement <vscale x 16 x i1> [[LOOP_DEP_MASK]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor i1 [[NO_CONFLICT]], true
+; CHECK-NEXT: br i1 [[TMP9]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP8]], 4
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP24]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ugt i64 [[N]], [[TMP14]]
+; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP17]], i64 [[TMP15]], i64 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP10]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP12]], ptr align 1 [[TMP16]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]])
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = xor i1 [[TMP23]], true
+; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %add = add i8 %load.b, %load.a
+ %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv
+ store i8 %add, ptr %gep.c, align 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body, %entry
+ ret void
+}
+
+define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define i32 @alias_mask_read_after_write(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[C2:%.*]] = ptrtoaddr ptr [[C]] to i64
+; CHECK-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
+; CHECK-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[C2]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP13]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP20]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP9]], 2
+; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP27]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP18]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[N]], [[TMP18]]
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP17]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr align 2 [[TMP19]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP21]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP23:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP24:%.*]] = add <vscale x 4 x i32> [[TMP23]], [[WIDE_MASKED_LOAD5]]
+; CHECK-NEXT: [[TMP25]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = xor i1 [[TMP28]], true
+; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+ %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
+ %load.a = load i32, ptr %gep.a, align 2
+ %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+ store i32 %load.a, ptr %gep.c, align 2
+ %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+ %load.b = load i32, ptr %gep.b, align 2
+ %add = add i32 %load.a, %accum
+ %add2 = add i32 %add, %load.b
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %entry, %for.body
+ ret i32 %add2
+}
+
+define void @alias_mask_multiple(ptr %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define void @alias_mask_multiple(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B3:%.*]] = ptrtoaddr ptr [[B]] to i64
+; CHECK-NEXT: [[A2:%.*]] = ptrtoaddr ptr [[A]] to i64
+; CHECK-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64
+; CHECK-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[C1]], [[A2]]
+; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; CHECK-NEXT: [[LOOP_DEP_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr null, ptr [[TMP1]], i64 1)
+; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP32:%.*]] = mul nuw i64 [[TMP31]], 16
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP32]], 1
+; CHECK-NEXT: [[NO_CONFLICT:%.*]] = extractelement <vscale x 16 x i1> [[LOOP_DEP_MASK]], i64 [[TMP2]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = xor i1 [[NO_CONFLICT]], true
+; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[C1]], [[B3]]
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[LOOP_DEP_MASK4:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr null, ptr [[TMP8]], i64 1)
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP6]], 16
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT: [[NO_CONFLICT5:%.*]] = extractelement <vscale x 16 x i1> [[LOOP_DEP_MASK4]], i64 [[TMP5]]
+; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = xor i1 [[NO_CONFLICT5]], true
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP19]], 4
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP18]], 4
+; CHECK-NEXT: [[TMP21:%.*]] = sub i64 [[N]], [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ugt i64 [[N]], [[TMP20]]
+; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP25]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP26]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP27:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP27]], ptr align 1 [[TMP28]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP23]])
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT: [[TMP30:%.*]] = xor i1 [[TMP29]], true
+; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %add = add i8 %load.b, %load.a
+ %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv
+ store i8 %add, ptr %gep.c, align 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body, %entry
+ ret void
+}
+
+define i32 @alias_mask_multiple_read_after_write(ptr %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define i32 @alias_mask_multiple_read_after_write(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B3:%.*]] = ptrtoaddr ptr [[B]] to i64
+; CHECK-NEXT: [[A2:%.*]] = ptrtoaddr ptr [[A]] to i64
+; CHECK-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64
+; CHECK-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[C1]], [[A2]]
+; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; CHECK-NEXT: [[LOOP_DEP_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr null, ptr [[TMP1]], i64 4)
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP6]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT: [[NO_CONFLICT:%.*]] = extractelement <vscale x 4 x i1> [[LOOP_DEP_MASK]], i64 [[TMP2]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = xor i1 [[NO_CONFLICT]], true
+; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[B3]], [[C1]]
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
+; CHECK-NEXT: [[LOOP_DEP_MASK4:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr null, ptr [[TMP10]], i64 4)
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP8]], 1
+; CHECK-NEXT: [[NO_CONFLICT5:%.*]] = extractelement <vscale x 4 x i1> [[LOOP_DEP_MASK4]], i64 [[TMP5]]
+; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = xor i1 [[NO_CONFLICT5]], true
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP19]], 2
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP24]], 2
+; CHECK-NEXT: [[TMP21:%.*]] = sub i64 [[N]], [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ugt i64 [[N]], [[TMP20]]
+; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP25]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr align 4 [[TMP26]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP27]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP28:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP29:%.*]] = add <vscale x 4 x i32> [[TMP28]], [[WIDE_MASKED_LOAD14]]
+; CHECK-NEXT: [[TMP30]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP29]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP23]])
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT: [[TMP32:%.*]] = xor i1 [[TMP31]], true
+; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: middle.block:
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+ %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
+ %load.a = load i32, ptr %gep.a, align 4
+ %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+ store i32 %load.a, ptr %gep.c, align 4
+ %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+ %load.b = load i32, ptr %gep.b, align 4
+ %add = add i32 %load.a, %accum
+ %add2 = add i32 %add, %load.b
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %entry, %for.body
+ ret i32 %add2
+}
+
+; Negative test: We can't use `loop.dependence.war.mask` with unaligned pointers.
+define void @misaligned_alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define void @misaligned_alias_mask(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B2:%.*]] = ptrtoaddr ptr [[B]] to i64
+; CHECK-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64
+; CHECK-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[C1]], [[B2]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP12]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP13]], ptr align 2 [[TMP14]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]])
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: middle.block:
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
+ %load.a = load i32, ptr %gep.a, align 2
+ %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+ %load.b = load i32, ptr %gep.b, align 2
+ %add = add i32 %load.b, %load.a
+ %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+ store i32 %add, ptr %gep.c, align 2
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 1ef9b60f63bef..1bcece0e53e35 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -1263,9 +1263,9 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; DEFAULT: [[VECTOR_MEMCHECK]]:
+; DEFAULT-NEXT: [[TMP6:%.*]] = sub i64 [[C1]], [[A2]]
; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; DEFAULT-NEXT: [[TMP6:%.*]] = sub i64 [[C1]], [[A2]]
; DEFAULT-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; DEFAULT-NEXT: [[TMP7:%.*]] = sub i64 [[C1]], [[B3]]
; DEFAULT-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP7]], [[TMP5]]
@@ -1317,9 +1317,9 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; PRED-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; PRED: [[VECTOR_MEMCHECK]]:
+; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[C1]], [[A2]]
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
-; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[C1]], [[A2]]
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
; PRED-NEXT: [[TMP4:%.*]] = sub i64 [[C1]], [[B3]]
; PRED-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP4]], [[TMP2]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 8bbd3d6942cc7..9de29f6ace613 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -17,10 +17,10 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; DEFAULT: [[VECTOR_MEMCHECK]]:
+; DEFAULT-NEXT: [[TMP6:%.*]] = sub i64 [[DST1]], [[SRC2]]
; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
; DEFAULT-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; DEFAULT-NEXT: [[TMP6:%.*]] = sub i64 [[DST1]], [[SRC2]]
; DEFAULT-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; DEFAULT-NEXT: br i1 [[DIFF_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
; DEFAULT: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
@@ -123,9 +123,9 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; PRED-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; PRED: [[VECTOR_MEMCHECK]]:
+; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
-; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
; PRED-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
index 6cd94d9907597..d3b42e73caa78 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
@@ -132,17 +132,17 @@ define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, pt
; CHECK-NEXT: [[OUT_B1:%.*]] = ptrtoaddr ptr [[OUT_B]] to i64
; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[OUT_B1]], [[OUT_A2]]
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[OUT_B1]], [[OUT_A2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[OUT_A2]], [[IN3]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]]
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[OUT_B1]], [[IN3]]
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT: [[DIFF_CHECK5:%.*]] = icmp ult i64 [[TMP7]], [[TMP6]]
; CHECK-NEXT: [[CONFLICT_RDX6:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK5]]
; CHECK-NEXT: br i1 [[CONFLICT_RDX6]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
index 7d807277d9853..db17b80761e31 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
@@ -20,10 +20,10 @@ define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef read
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[D1]], [[S2]]
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[D1]], [[S2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]]
; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
index ea2c092c49960..8a19d15a6af04 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
@@ -20,10 +20,10 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 {
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
@@ -88,10 +88,10 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 {
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
index bd4ea480c80c7..17794c5bbedb5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
@@ -19,25 +19,25 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[DST_21]], [[DST_12]]
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
-; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[DST_21]], [[DST_12]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 16
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[DST_12]], [[SRC_13]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 16
; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP4]], 16
; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[DST_12]], [[SRC_25]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP4]], 16
; CHECK-NEXT: [[DIFF_CHECK6:%.*]] = icmp ult i64 [[TMP10]], [[TMP9]]
; CHECK-NEXT: [[CONFLICT_RDX7:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK6]]
-; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP4]], 16
; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[DST_21]], [[SRC_13]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP4]], 16
; CHECK-NEXT: [[DIFF_CHECK8:%.*]] = icmp ult i64 [[TMP12]], [[TMP11]]
; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX7]], [[DIFF_CHECK8]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP4]], 16
; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[DST_21]], [[SRC_25]]
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP4]], 16
; CHECK-NEXT: [[DIFF_CHECK10:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]]
; CHECK-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX9]], [[DIFF_CHECK10]]
; CHECK-NEXT: br i1 [[CONFLICT_RDX11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
index f7ca39b8c920e..4a71bbc8fc8c4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -90,9 +90,9 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 7
-; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
index ee3144549bcbb..800b2db31dd88 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
@@ -307,7 +307,7 @@ define void @histogram_float(ptr noalias %buckets, ptr readonly %indices, i64 %N
; CHECK-NEXT: store float [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: for.exit:
; CHECK-NEXT: ret void
;
@@ -350,7 +350,7 @@ define void @histogram_varying_increment(ptr noalias %buckets, ptr readonly %ind
; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP14]]
; CHECK: for.exit:
; CHECK-NEXT: ret void
;
@@ -406,7 +406,7 @@ define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly
; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP21]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
@@ -459,7 +459,7 @@ define void @histogram_array_3op_gep(i64 noundef %N) #0 {
; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP11]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
@@ -513,7 +513,7 @@ define void @histogram_array_4op_gep_nonzero_const_idx(i64 noundef %N, ptr reado
; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP7]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
@@ -564,7 +564,7 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK: for.exit:
@@ -604,11 +604,14 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr %
; CHECK: vector.memcheck:
; CHECK-NEXT: [[ARRAY1:%.*]] = ptrtoaddr ptr [[ARRAY]] to i64
; CHECK-NEXT: [[INDICES2:%.*]] = ptrtoaddr ptr [[INDICES]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[ARRAY1]], [[INDICES2]]
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[LOOP_DEP_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr null, ptr [[TMP10]], i64 4)
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[ARRAY1]], [[INDICES2]]
-; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]]
-; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP3]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP5]], -1
+; CHECK-NEXT: [[NO_CONFLICT:%.*]] = extractelement <vscale x 4 x i1> [[LOOP_DEP_MASK]], i64 [[TMP4]]
+; CHECK-NEXT: br i1 [[NO_CONFLICT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
@@ -632,7 +635,7 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr %
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
@@ -725,7 +728,7 @@ define void @simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i
; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> [[TMP6]], i64 1, <vscale x 2 x i1> splat (i1 true))
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
index 723d4f16e289e..1526c23dc1a55 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
@@ -16,13 +16,13 @@ define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP16]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[OUTPUT1]], [[INPUT23]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], [[TMP7]]
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
@@ -77,13 +77,13 @@ define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef rea
; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]]
; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; ZVFHMIN: [[VECTOR_MEMCHECK]]:
+; ZVFHMIN-NEXT: [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; ZVFHMIN-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; ZVFHMIN-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; ZVFHMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; ZVFHMIN-NEXT: [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; ZVFHMIN-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; ZVFHMIN-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; ZVFHMIN-NEXT: [[TMP8:%.*]] = sub i64 [[OUTPUT1]], [[INPUT23]]
+; ZVFHMIN-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; ZVFHMIN-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; ZVFHMIN-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; ZVFHMIN-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
@@ -161,13 +161,13 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP16]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[OUTPUT1]], [[INPUT23]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], [[TMP7]]
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
@@ -222,13 +222,13 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea
; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]]
; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; ZVFHMIN: [[VECTOR_MEMCHECK]]:
+; ZVFHMIN-NEXT: [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; ZVFHMIN-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; ZVFHMIN-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; ZVFHMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; ZVFHMIN-NEXT: [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; ZVFHMIN-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; ZVFHMIN-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; ZVFHMIN-NEXT: [[TMP8:%.*]] = sub i64 [[OUTPUT1]], [[INPUT23]]
+; ZVFHMIN-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; ZVFHMIN-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; ZVFHMIN-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; ZVFHMIN-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
@@ -306,13 +306,13 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP15]], 2
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP16]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 8
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[OUTPUT1]], [[INPUT23]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 8
; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], [[TMP7]]
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
@@ -367,13 +367,13 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea
; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]]
; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; ZVFHMIN: [[VECTOR_MEMCHECK]]:
+; ZVFHMIN-NEXT: [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; ZVFHMIN-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; ZVFHMIN-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
; ZVFHMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; ZVFHMIN-NEXT: [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; ZVFHMIN-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; ZVFHMIN-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 8
; ZVFHMIN-NEXT: [[TMP8:%.*]] = sub i64 [[OUTPUT1]], [[INPUT23]]
+; ZVFHMIN-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 8
; ZVFHMIN-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; ZVFHMIN-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; ZVFHMIN-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
@@ -451,13 +451,13 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP15]], 2
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP16]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 8
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[OUTPUT1]], [[INPUT23]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 8
; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], [[TMP7]]
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
@@ -512,13 +512,13 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea
; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]]
; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; ZVFHMIN: [[VECTOR_MEMCHECK]]:
+; ZVFHMIN-NEXT: [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; ZVFHMIN-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; ZVFHMIN-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
; ZVFHMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; ZVFHMIN-NEXT: [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; ZVFHMIN-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; ZVFHMIN-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 8
; ZVFHMIN-NEXT: [[TMP8:%.*]] = sub i64 [[OUTPUT1]], [[INPUT23]]
+; ZVFHMIN-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 8
; ZVFHMIN-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; ZVFHMIN-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; ZVFHMIN-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
@@ -592,13 +592,13 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK-NEXT: [[OUTPUT1:%.*]] = ptrtoaddr ptr [[OUTPUT]] to i64
; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 2
-; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP18]]
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP16]], 2
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[OUTPUT1]], [[INPUT23]]
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP16]], 2
; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], [[TMP19]]
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -647,13 +647,13 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea
; ZVFHMIN-NEXT: [[OUTPUT1:%.*]] = ptrtoaddr ptr [[OUTPUT]] to i64
; ZVFHMIN-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; ZVFHMIN: [[VECTOR_MEMCHECK]]:
+; ZVFHMIN-NEXT: [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; ZVFHMIN-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; ZVFHMIN-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
; ZVFHMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; ZVFHMIN-NEXT: [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; ZVFHMIN-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; ZVFHMIN-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 2
; ZVFHMIN-NEXT: [[TMP8:%.*]] = sub i64 [[OUTPUT1]], [[INPUT23]]
+; ZVFHMIN-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 2
; ZVFHMIN-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; ZVFHMIN-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; ZVFHMIN-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -725,13 +725,13 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK-NEXT: [[OUTPUT1:%.*]] = ptrtoaddr ptr [[OUTPUT]] to i64
; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 2
-; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP18]]
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP16]], 2
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[OUTPUT1]], [[INPUT23]]
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP16]], 2
; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], [[TMP19]]
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -780,13 +780,13 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea
; ZVFHMIN-NEXT: [[OUTPUT1:%.*]] = ptrtoaddr ptr [[OUTPUT]] to i64
; ZVFHMIN-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; ZVFHMIN: [[VECTOR_MEMCHECK]]:
+; ZVFHMIN-NEXT: [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; ZVFHMIN-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; ZVFHMIN-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
; ZVFHMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; ZVFHMIN-NEXT: [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
; ZVFHMIN-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; ZVFHMIN-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 2
; ZVFHMIN-NEXT: [[TMP8:%.*]] = sub i64 [[OUTPUT1]], [[INPUT23]]
+; ZVFHMIN-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 2
; ZVFHMIN-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; ZVFHMIN-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; ZVFHMIN-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 33c96c5eb21b2..b9f1373d31c96 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -197,10 +197,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[TMP9]]
; RV64-NEXT: br i1 [[TMP8]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; RV64: [[VECTOR_MEMCHECK]]:
+; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]]
; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; RV64-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
; RV64-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]]
; RV64-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]]
; RV64-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; RV64: [[VECTOR_PH]]:
@@ -260,10 +260,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
; RV32-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; RV32: [[VECTOR_MEMCHECK]]:
+; RV32-NEXT: [[TMP6:%.*]] = sub i32 [[B1]], [[A2]]
; RV32-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
; RV32-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
; RV32-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4
-; RV32-NEXT: [[TMP6:%.*]] = sub i32 [[B1]], [[A2]]
; RV32-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP6]], [[TMP5]]
; RV32-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; RV32: [[VECTOR_PH]]:
@@ -331,10 +331,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP7]], [[TMP9]]
; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]]
; RV64-UF2: [[VECTOR_MEMCHECK]]:
+; RV64-UF2-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]]
; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
-; RV64-UF2-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]]
; RV64-UF2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]]
; RV64-UF2-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; RV64-UF2: [[VECTOR_PH]]:
@@ -448,10 +448,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[TMP9]]
; RV64-NEXT: br i1 [[TMP8]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; RV64: [[VECTOR_MEMCHECK]]:
+; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]]
; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; RV64-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
; RV64-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]]
; RV64-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]]
; RV64-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; RV64: [[VECTOR_PH]]:
@@ -511,10 +511,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
; RV32-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; RV32: [[VECTOR_MEMCHECK]]:
+; RV32-NEXT: [[TMP6:%.*]] = sub i32 [[B1]], [[A2]]
; RV32-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
; RV32-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
; RV32-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4
-; RV32-NEXT: [[TMP6:%.*]] = sub i32 [[B1]], [[A2]]
; RV32-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP6]], [[TMP5]]
; RV32-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; RV32: [[VECTOR_PH]]:
@@ -582,10 +582,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP7]], [[TMP9]]
; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]]
; RV64-UF2: [[VECTOR_MEMCHECK]]:
+; RV64-UF2-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]]
; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
-; RV64-UF2-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]]
; RV64-UF2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]]
; RV64-UF2-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; RV64-UF2: [[VECTOR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 85d9a11cb65e0..1aa4371664ca8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -611,10 +611,10 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[STRIDE:%.*]], 1
; NOSTRIDED-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; NOSTRIDED: vector.memcheck:
+; NOSTRIDED-NEXT: [[TMP6:%.*]] = sub i64 [[P21]], [[P3]]
; NOSTRIDED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NOSTRIDED-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NOSTRIDED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NOSTRIDED-NEXT: [[TMP6:%.*]] = sub i64 [[P21]], [[P3]]
; NOSTRIDED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NOSTRIDED-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; NOSTRIDED: vector.ph:
@@ -665,10 +665,10 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-UF2-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[STRIDE:%.*]], 1
; NOSTRIDED-UF2-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
; NOSTRIDED-UF2: vector.memcheck:
+; NOSTRIDED-UF2-NEXT: [[TMP5:%.*]] = sub i64 [[P21]], [[P3]]
; NOSTRIDED-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NOSTRIDED-UF2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
; NOSTRIDED-UF2-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8
-; NOSTRIDED-UF2-NEXT: [[TMP5:%.*]] = sub i64 [[P21]], [[P3]]
; NOSTRIDED-UF2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]]
; NOSTRIDED-UF2-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; NOSTRIDED-UF2: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-bin-unary-ops-args.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-bin-unary-ops-args.ll
index 06d4c24459945..5177a34186494 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-bin-unary-ops-args.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-bin-unary-ops-args.ll
@@ -16,9 +16,9 @@ define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -65,9 +65,9 @@ define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -131,9 +131,9 @@ define void @test_or(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -180,9 +180,9 @@ define void @test_or(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -246,9 +246,9 @@ define void @test_xor(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -295,9 +295,9 @@ define void @test_xor(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -361,9 +361,9 @@ define void @test_shl(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -410,9 +410,9 @@ define void @test_shl(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -476,9 +476,9 @@ define void @test_lshr(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -525,9 +525,9 @@ define void @test_lshr(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -591,9 +591,9 @@ define void @test_ashr(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -640,9 +640,9 @@ define void @test_ashr(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -706,9 +706,9 @@ define void @test_add(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -755,9 +755,9 @@ define void @test_add(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -821,9 +821,9 @@ define void @test_sub(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -870,9 +870,9 @@ define void @test_sub(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -936,9 +936,9 @@ define void @test_mul(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -985,9 +985,9 @@ define void @test_mul(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -1051,9 +1051,9 @@ define void @test_sdiv(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -1100,9 +1100,9 @@ define void @test_sdiv(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -1166,9 +1166,9 @@ define void @test_udiv(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -1215,9 +1215,9 @@ define void @test_udiv(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -1281,9 +1281,9 @@ define void @test_srem(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -1330,9 +1330,9 @@ define void @test_srem(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -1396,9 +1396,9 @@ define void @test_urem(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -1445,9 +1445,9 @@ define void @test_urem(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -1513,10 +1513,10 @@ define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -1563,10 +1563,10 @@ define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -1630,10 +1630,10 @@ define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -1680,10 +1680,10 @@ define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -1747,10 +1747,10 @@ define void @test_fmul(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -1797,10 +1797,10 @@ define void @test_fmul(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -1864,10 +1864,10 @@ define void @test_fdiv(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -1914,10 +1914,10 @@ define void @test_fdiv(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -2034,10 +2034,10 @@ define void @test_fneg(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -2084,10 +2084,10 @@ define void @test_fneg(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll
index 62a200d17c8a2..f4b6bff6f706b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll
@@ -16,13 +16,13 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 [[TMP5]], 4
-; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP5]], 4
; IF-EVL-NEXT: [[TMP26:%.*]] = sub i64 [[A1]], [[C3]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP5]], 4
; IF-EVL-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP26]], [[TMP25]]
; IF-EVL-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; IF-EVL-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -75,13 +75,13 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; NO-VP-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[C3]]
+; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; NO-VP-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; NO-VP-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; NO-VP-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
@@ -154,13 +154,13 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 [[TMP5]], 4
-; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP5]], 4
; IF-EVL-NEXT: [[TMP26:%.*]] = sub i64 [[A1]], [[C3]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP5]], 4
; IF-EVL-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP26]], [[TMP25]]
; IF-EVL-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; IF-EVL-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -213,13 +213,13 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; NO-VP-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[C3]]
+; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; NO-VP-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; NO-VP-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; NO-VP-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
@@ -292,13 +292,13 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 [[TMP5]], 4
-; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP5]], 4
; IF-EVL-NEXT: [[TMP26:%.*]] = sub i64 [[A1]], [[C3]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP5]], 4
; IF-EVL-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP26]], [[TMP25]]
; IF-EVL-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; IF-EVL-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -351,13 +351,13 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; NO-VP-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[C3]]
+; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; NO-VP-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; NO-VP-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; NO-VP-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
@@ -430,13 +430,13 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 [[TMP5]], 4
-; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP5]], 4
; IF-EVL-NEXT: [[TMP26:%.*]] = sub i64 [[A1]], [[C3]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP5]], 4
; IF-EVL-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP26]], [[TMP25]]
; IF-EVL-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; IF-EVL-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -489,13 +489,13 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; NO-VP-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[C3]]
+; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
; NO-VP-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; NO-VP-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; NO-VP-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
@@ -568,10 +568,10 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP21:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 [[TMP5]], 4
-; IF-EVL-NEXT: [[TMP21:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP21]], [[TMP20]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -618,10 +618,10 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -686,10 +686,10 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP7]], [[TMP6]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -736,10 +736,10 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -804,10 +804,10 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 [[TMP5]], 4
-; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -858,10 +858,10 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -932,10 +932,10 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 [[TMP5]], 4
-; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -986,10 +986,10 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -1060,10 +1060,10 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP21:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 [[TMP5]], 4
-; IF-EVL-NEXT: [[TMP21:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP21]], [[TMP20]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -1110,10 +1110,10 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll
index b891aea634f1c..a7d798ecc115f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll
@@ -600,10 +600,10 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -650,10 +650,10 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -717,10 +717,10 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -767,10 +767,10 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -834,10 +834,10 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -884,10 +884,10 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -951,10 +951,10 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -1001,10 +1001,10 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
@@ -1068,10 +1068,10 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64
; IF-EVL-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
-; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IF-EVL: [[VECTOR_PH]]:
@@ -1118,10 +1118,10 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) {
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; NO-VP: [[VECTOR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference-simplifications.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference-simplifications.ll
index c64aee7f35fb1..6a558e75eeee2 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference-simplifications.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference-simplifications.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -o - -S %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-target-instruction-cost=1 -force-vector-interleave=1 -o - -S %s | FileCheck %s --check-prefix=CHECK-LOOP-DEP
; Test case with a large number of pointer groups to check for memory
; conflicts, but with many redundant checks that can be simplified.
@@ -149,6 +150,184 @@ define void @test_large_number_of_group(ptr %dst, i64 %off, i64 %N) {
; CHECK: exit:
; CHECK-NEXT: ret void
;
+; CHECK-LOOP-DEP-LABEL: @test_large_number_of_group(
+; CHECK-LOOP-DEP-NEXT: entry:
+; CHECK-LOOP-DEP-NEXT: [[OFF_MUL_2:%.*]] = shl i64 [[OFF:%.*]], 1
+; CHECK-LOOP-DEP-NEXT: [[OFF_MUL_3:%.*]] = mul i64 [[OFF]], 3
+; CHECK-LOOP-DEP-NEXT: [[OFF_MUL_4:%.*]] = shl i64 [[OFF]], 2
+; CHECK-LOOP-DEP-NEXT: [[OFF_MUL_5:%.*]] = mul i64 [[OFF]], 5
+; CHECK-LOOP-DEP-NEXT: [[OFF_MUL_6:%.*]] = mul i64 [[OFF]], 6
+; CHECK-LOOP-DEP-NEXT: [[OFF_MUL_7:%.*]] = mul i64 [[OFF]], 7
+; CHECK-LOOP-DEP-NEXT: [[OFF_MUL_8:%.*]] = shl i64 [[OFF]], 3
+; CHECK-LOOP-DEP-NEXT: [[OFF_MUL_9:%.*]] = mul i64 [[OFF]], 9
+; CHECK-LOOP-DEP-NEXT: [[OFF_MUL_10:%.*]] = mul i64 [[OFF]], 10
+; CHECK-LOOP-DEP-NEXT: [[OFF_MUL_11:%.*]] = mul i64 [[OFF]], 11
+; CHECK-LOOP-DEP-NEXT: [[OFF_MUL_12:%.*]] = mul i64 [[OFF]], 12
+; CHECK-LOOP-DEP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-LOOP-DEP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-LOOP-DEP: vector.memcheck:
+; CHECK-LOOP-DEP-NEXT: [[TMP0:%.*]] = inttoptr i64 [[OFF_MUL_8]] to ptr
+; CHECK-LOOP-DEP-NEXT: [[LOOP_DEP_MASK1:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr null, ptr [[TMP0]], i64 8)
+; CHECK-LOOP-DEP-NEXT: [[NO_CONFLICT1:%.*]] = extractelement <4 x i1> [[LOOP_DEP_MASK1]], i64 3
+; CHECK-LOOP-DEP-NEXT: [[IS_CONFLICT1:%.*]] = xor i1 [[NO_CONFLICT1]], true
+; CHECK-LOOP-DEP-NEXT: [[TMP2:%.*]] = shl i64 [[OFF]], 4
+; CHECK-LOOP-DEP-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-LOOP-DEP-NEXT: [[LOOP_DEP_MASK:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr null, ptr [[TMP3]], i64 8)
+; CHECK-LOOP-DEP-NEXT: [[NO_CONFLICT:%.*]] = extractelement <4 x i1> [[LOOP_DEP_MASK]], i64 3
+; CHECK-LOOP-DEP-NEXT: [[IS_CONFLICT:%.*]] = xor i1 [[NO_CONFLICT]], true
+; CHECK-LOOP-DEP-NEXT: [[CONFLICT_RDX1:%.*]] = or i1 [[IS_CONFLICT1]], [[IS_CONFLICT]]
+; CHECK-LOOP-DEP-NEXT: [[TMP7:%.*]] = mul i64 [[OFF]], 24
+; CHECK-LOOP-DEP-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-LOOP-DEP-NEXT: [[LOOP_DEP_MASK2:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr null, ptr [[TMP4]], i64 8)
+; CHECK-LOOP-DEP-NEXT: [[NO_CONFLICT3:%.*]] = extractelement <4 x i1> [[LOOP_DEP_MASK2]], i64 3
+; CHECK-LOOP-DEP-NEXT: [[IS_CONFLICT4:%.*]] = xor i1 [[NO_CONFLICT3]], true
+; CHECK-LOOP-DEP-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[CONFLICT_RDX1]], [[IS_CONFLICT4]]
+; CHECK-LOOP-DEP-NEXT: [[TMP12:%.*]] = shl i64 [[OFF]], 5
+; CHECK-LOOP-DEP-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-LOOP-DEP-NEXT: [[LOOP_DEP_MASK5:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr null, ptr [[TMP6]], i64 8)
+; CHECK-LOOP-DEP-NEXT: [[NO_CONFLICT6:%.*]] = extractelement <4 x i1> [[LOOP_DEP_MASK5]], i64 3
+; CHECK-LOOP-DEP-NEXT: [[IS_CONFLICT7:%.*]] = xor i1 [[NO_CONFLICT6]], true
+; CHECK-LOOP-DEP-NEXT: [[CONFLICT_RDX8:%.*]] = or i1 [[CONFLICT_RDX]], [[IS_CONFLICT7]]
+; CHECK-LOOP-DEP-NEXT: [[TMP17:%.*]] = mul i64 [[OFF]], 40
+; CHECK-LOOP-DEP-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP17]] to ptr
+; CHECK-LOOP-DEP-NEXT: [[LOOP_DEP_MASK9:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr null, ptr [[TMP8]], i64 8)
+; CHECK-LOOP-DEP-NEXT: [[NO_CONFLICT10:%.*]] = extractelement <4 x i1> [[LOOP_DEP_MASK9]], i64 3
+; CHECK-LOOP-DEP-NEXT: [[IS_CONFLICT11:%.*]] = xor i1 [[NO_CONFLICT10]], true
+; CHECK-LOOP-DEP-NEXT: [[CONFLICT_RDX12:%.*]] = or i1 [[CONFLICT_RDX8]], [[IS_CONFLICT11]]
+; CHECK-LOOP-DEP-NEXT: [[TMP22:%.*]] = mul i64 [[OFF]], 48
+; CHECK-LOOP-DEP-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP22]] to ptr
+; CHECK-LOOP-DEP-NEXT: [[LOOP_DEP_MASK13:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr null, ptr [[TMP10]], i64 8)
+; CHECK-LOOP-DEP-NEXT: [[NO_CONFLICT14:%.*]] = extractelement <4 x i1> [[LOOP_DEP_MASK13]], i64 3
+; CHECK-LOOP-DEP-NEXT: [[IS_CONFLICT15:%.*]] = xor i1 [[NO_CONFLICT14]], true
+; CHECK-LOOP-DEP-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX12]], [[IS_CONFLICT15]]
+; CHECK-LOOP-DEP-NEXT: [[TMP27:%.*]] = mul i64 [[OFF]], 56
+; CHECK-LOOP-DEP-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP27]] to ptr
+; CHECK-LOOP-DEP-NEXT: [[LOOP_DEP_MASK17:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr null, ptr [[TMP13]], i64 8)
+; CHECK-LOOP-DEP-NEXT: [[NO_CONFLICT18:%.*]] = extractelement <4 x i1> [[LOOP_DEP_MASK17]], i64 3
+; CHECK-LOOP-DEP-NEXT: [[IS_CONFLICT19:%.*]] = xor i1 [[NO_CONFLICT18]], true
+; CHECK-LOOP-DEP-NEXT: [[CONFLICT_RDX20:%.*]] = or i1 [[CONFLICT_RDX16]], [[IS_CONFLICT19]]
+; CHECK-LOOP-DEP-NEXT: [[TMP32:%.*]] = shl i64 [[OFF]], 6
+; CHECK-LOOP-DEP-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP32]] to ptr
+; CHECK-LOOP-DEP-NEXT: [[LOOP_DEP_MASK21:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr null, ptr [[TMP14]], i64 8)
+; CHECK-LOOP-DEP-NEXT: [[NO_CONFLICT22:%.*]] = extractelement <4 x i1> [[LOOP_DEP_MASK21]], i64 3
+; CHECK-LOOP-DEP-NEXT: [[IS_CONFLICT23:%.*]] = xor i1 [[NO_CONFLICT22]], true
+; CHECK-LOOP-DEP-NEXT: [[CONFLICT_RDX24:%.*]] = or i1 [[CONFLICT_RDX20]], [[IS_CONFLICT23]]
+; CHECK-LOOP-DEP-NEXT: [[TMP37:%.*]] = mul i64 [[OFF]], 72
+; CHECK-LOOP-DEP-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP37]] to ptr
+; CHECK-LOOP-DEP-NEXT: [[LOOP_DEP_MASK25:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr null, ptr [[TMP16]], i64 8)
+; CHECK-LOOP-DEP-NEXT: [[NO_CONFLICT26:%.*]] = extractelement <4 x i1> [[LOOP_DEP_MASK25]], i64 3
+; CHECK-LOOP-DEP-NEXT: [[IS_CONFLICT27:%.*]] = xor i1 [[NO_CONFLICT26]], true
+; CHECK-LOOP-DEP-NEXT: [[CONFLICT_RDX28:%.*]] = or i1 [[CONFLICT_RDX24]], [[IS_CONFLICT27]]
+; CHECK-LOOP-DEP-NEXT: [[TMP42:%.*]] = mul i64 [[OFF]], 80
+; CHECK-LOOP-DEP-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP42]] to ptr
+; CHECK-LOOP-DEP-NEXT: [[LOOP_DEP_MASK29:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr null, ptr [[TMP18]], i64 8)
+; CHECK-LOOP-DEP-NEXT: [[NO_CONFLICT30:%.*]] = extractelement <4 x i1> [[LOOP_DEP_MASK29]], i64 3
+; CHECK-LOOP-DEP-NEXT: [[IS_CONFLICT31:%.*]] = xor i1 [[NO_CONFLICT30]], true
+; CHECK-LOOP-DEP-NEXT: [[CONFLICT_RDX32:%.*]] = or i1 [[CONFLICT_RDX28]], [[IS_CONFLICT31]]
+; CHECK-LOOP-DEP-NEXT: [[TMP47:%.*]] = mul i64 [[OFF]], 88
+; CHECK-LOOP-DEP-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP47]] to ptr
+; CHECK-LOOP-DEP-NEXT: [[LOOP_DEP_MASK257:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr null, ptr [[TMP20]], i64 8)
+; CHECK-LOOP-DEP-NEXT: [[NO_CONFLICT258:%.*]] = extractelement <4 x i1> [[LOOP_DEP_MASK257]], i64 3
+; CHECK-LOOP-DEP-NEXT: [[IS_CONFLICT259:%.*]] = xor i1 [[NO_CONFLICT258]], true
+; CHECK-LOOP-DEP-NEXT: [[CONFLICT_RDX260:%.*]] = or i1 [[CONFLICT_RDX32]], [[IS_CONFLICT259]]
+; CHECK-LOOP-DEP-NEXT: br i1 [[CONFLICT_RDX260]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-LOOP-DEP: vector.ph:
+; CHECK-LOOP-DEP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-LOOP-DEP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-LOOP-DEP-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-LOOP-DEP: vector.body:
+; CHECK-LOOP-DEP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-LOOP-DEP-NEXT: [[TMP167:%.*]] = add nsw i64 [[INDEX]], -5
+; CHECK-LOOP-DEP-NEXT: [[TMP168:%.*]] = add i64 [[TMP167]], [[OFF]]
+; CHECK-LOOP-DEP-NEXT: [[TMP169:%.*]] = getelementptr i64, ptr [[DST:%.*]], i64 [[TMP168]]
+; CHECK-LOOP-DEP-NEXT: store <4 x double> zeroinitializer, ptr [[TMP169]], align 8
+; CHECK-LOOP-DEP-NEXT: [[TMP170:%.*]] = add i64 [[TMP167]], [[OFF_MUL_2]]
+; CHECK-LOOP-DEP-NEXT: [[TMP171:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP170]]
+; CHECK-LOOP-DEP-NEXT: store <4 x double> zeroinitializer, ptr [[TMP171]], align 8
+; CHECK-LOOP-DEP-NEXT: [[TMP172:%.*]] = add i64 [[TMP167]], [[OFF_MUL_3]]
+; CHECK-LOOP-DEP-NEXT: [[TMP173:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP172]]
+; CHECK-LOOP-DEP-NEXT: store <4 x double> zeroinitializer, ptr [[TMP173]], align 8
+; CHECK-LOOP-DEP-NEXT: [[TMP174:%.*]] = add i64 [[TMP167]], [[OFF_MUL_4]]
+; CHECK-LOOP-DEP-NEXT: [[TMP175:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP174]]
+; CHECK-LOOP-DEP-NEXT: store <4 x double> zeroinitializer, ptr [[TMP175]], align 8
+; CHECK-LOOP-DEP-NEXT: [[TMP176:%.*]] = add i64 [[TMP167]], [[OFF_MUL_5]]
+; CHECK-LOOP-DEP-NEXT: [[TMP177:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP176]]
+; CHECK-LOOP-DEP-NEXT: store <4 x double> zeroinitializer, ptr [[TMP177]], align 8
+; CHECK-LOOP-DEP-NEXT: [[TMP178:%.*]] = add i64 [[TMP167]], [[OFF_MUL_6]]
+; CHECK-LOOP-DEP-NEXT: [[TMP179:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP178]]
+; CHECK-LOOP-DEP-NEXT: store <4 x double> zeroinitializer, ptr [[TMP179]], align 8
+; CHECK-LOOP-DEP-NEXT: [[TMP180:%.*]] = add i64 [[TMP167]], [[OFF_MUL_7]]
+; CHECK-LOOP-DEP-NEXT: [[TMP181:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP180]]
+; CHECK-LOOP-DEP-NEXT: store <4 x double> zeroinitializer, ptr [[TMP181]], align 8
+; CHECK-LOOP-DEP-NEXT: [[TMP182:%.*]] = add i64 [[TMP167]], [[OFF_MUL_8]]
+; CHECK-LOOP-DEP-NEXT: [[TMP183:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP182]]
+; CHECK-LOOP-DEP-NEXT: store <4 x double> zeroinitializer, ptr [[TMP183]], align 8
+; CHECK-LOOP-DEP-NEXT: [[TMP184:%.*]] = add i64 [[TMP167]], [[OFF_MUL_9]]
+; CHECK-LOOP-DEP-NEXT: [[TMP185:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP184]]
+; CHECK-LOOP-DEP-NEXT: store <4 x double> zeroinitializer, ptr [[TMP185]], align 8
+; CHECK-LOOP-DEP-NEXT: [[TMP186:%.*]] = add i64 [[TMP167]], [[OFF_MUL_10]]
+; CHECK-LOOP-DEP-NEXT: [[TMP187:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP186]]
+; CHECK-LOOP-DEP-NEXT: store <4 x double> zeroinitializer, ptr [[TMP187]], align 8
+; CHECK-LOOP-DEP-NEXT: [[TMP188:%.*]] = add i64 [[TMP167]], [[OFF_MUL_11]]
+; CHECK-LOOP-DEP-NEXT: [[TMP189:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP188]]
+; CHECK-LOOP-DEP-NEXT: store <4 x double> zeroinitializer, ptr [[TMP189]], align 8
+; CHECK-LOOP-DEP-NEXT: [[TMP190:%.*]] = add i64 [[TMP167]], [[OFF_MUL_12]]
+; CHECK-LOOP-DEP-NEXT: [[TMP191:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP190]]
+; CHECK-LOOP-DEP-NEXT: store <4 x double> zeroinitializer, ptr [[TMP191]], align 8
+; CHECK-LOOP-DEP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-LOOP-DEP-NEXT: [[TMP192:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-LOOP-DEP-NEXT: br i1 [[TMP192]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-LOOP-DEP: middle.block:
+; CHECK-LOOP-DEP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-LOOP-DEP-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-LOOP-DEP: scalar.ph:
+; CHECK-LOOP-DEP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-LOOP-DEP-NEXT: br label [[LOOP:%.*]]
+; CHECK-LOOP-DEP: loop:
+; CHECK-LOOP-DEP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-LOOP-DEP-NEXT: [[IV_SUB_5:%.*]] = add nsw i64 [[IV]], -5
+; CHECK-LOOP-DEP-NEXT: [[IDX_1:%.*]] = add i64 [[IV_SUB_5]], [[OFF]]
+; CHECK-LOOP-DEP-NEXT: [[GEP_1:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IDX_1]]
+; CHECK-LOOP-DEP-NEXT: store double 0.000000e+00, ptr [[GEP_1]], align 8
+; CHECK-LOOP-DEP-NEXT: [[IDX_2:%.*]] = add i64 [[IV_SUB_5]], [[OFF_MUL_2]]
+; CHECK-LOOP-DEP-NEXT: [[GEP_2:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IDX_2]]
+; CHECK-LOOP-DEP-NEXT: store double 0.000000e+00, ptr [[GEP_2]], align 8
+; CHECK-LOOP-DEP-NEXT: [[IDX_3:%.*]] = add i64 [[IV_SUB_5]], [[OFF_MUL_3]]
+; CHECK-LOOP-DEP-NEXT: [[GEP_3:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IDX_3]]
+; CHECK-LOOP-DEP-NEXT: store double 0.000000e+00, ptr [[GEP_3]], align 8
+; CHECK-LOOP-DEP-NEXT: [[IDX_4:%.*]] = add i64 [[IV_SUB_5]], [[OFF_MUL_4]]
+; CHECK-LOOP-DEP-NEXT: [[GEP_4:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IDX_4]]
+; CHECK-LOOP-DEP-NEXT: store double 0.000000e+00, ptr [[GEP_4]], align 8
+; CHECK-LOOP-DEP-NEXT: [[IDX_5:%.*]] = add i64 [[IV_SUB_5]], [[OFF_MUL_5]]
+; CHECK-LOOP-DEP-NEXT: [[GEP_5:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IDX_5]]
+; CHECK-LOOP-DEP-NEXT: store double 0.000000e+00, ptr [[GEP_5]], align 8
+; CHECK-LOOP-DEP-NEXT: [[IDX_6:%.*]] = add i64 [[IV_SUB_5]], [[OFF_MUL_6]]
+; CHECK-LOOP-DEP-NEXT: [[GEP_6:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IDX_6]]
+; CHECK-LOOP-DEP-NEXT: store double 0.000000e+00, ptr [[GEP_6]], align 8
+; CHECK-LOOP-DEP-NEXT: [[IDX_7:%.*]] = add i64 [[IV_SUB_5]], [[OFF_MUL_7]]
+; CHECK-LOOP-DEP-NEXT: [[GEP_7:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IDX_7]]
+; CHECK-LOOP-DEP-NEXT: store double 0.000000e+00, ptr [[GEP_7]], align 8
+; CHECK-LOOP-DEP-NEXT: [[IDX_8:%.*]] = add i64 [[IV_SUB_5]], [[OFF_MUL_8]]
+; CHECK-LOOP-DEP-NEXT: [[GEP_8:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IDX_8]]
+; CHECK-LOOP-DEP-NEXT: store double 0.000000e+00, ptr [[GEP_8]], align 8
+; CHECK-LOOP-DEP-NEXT: [[IDX_9:%.*]] = add i64 [[IV_SUB_5]], [[OFF_MUL_9]]
+; CHECK-LOOP-DEP-NEXT: [[GEP_9:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IDX_9]]
+; CHECK-LOOP-DEP-NEXT: store double 0.000000e+00, ptr [[GEP_9]], align 8
+; CHECK-LOOP-DEP-NEXT: [[IDX_10:%.*]] = add i64 [[IV_SUB_5]], [[OFF_MUL_10]]
+; CHECK-LOOP-DEP-NEXT: [[GEP_10:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IDX_10]]
+; CHECK-LOOP-DEP-NEXT: store double 0.000000e+00, ptr [[GEP_10]], align 8
+; CHECK-LOOP-DEP-NEXT: [[IDX_11:%.*]] = add i64 [[IV_SUB_5]], [[OFF_MUL_11]]
+; CHECK-LOOP-DEP-NEXT: [[GEP_11:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IDX_11]]
+; CHECK-LOOP-DEP-NEXT: store double 0.000000e+00, ptr [[GEP_11]], align 8
+; CHECK-LOOP-DEP-NEXT: [[IDX_12:%.*]] = add i64 [[IV_SUB_5]], [[OFF_MUL_12]]
+; CHECK-LOOP-DEP-NEXT: [[GEP_12:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IDX_12]]
+; CHECK-LOOP-DEP-NEXT: store double 0.000000e+00, ptr [[GEP_12]], align 8
+; CHECK-LOOP-DEP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-LOOP-DEP-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-LOOP-DEP-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-LOOP-DEP: exit:
+; CHECK-LOOP-DEP-NEXT: ret void
+;
entry:
%off.mul.2 = shl i64 %off, 1
%off.mul.3 = mul i64 %off, 3
@@ -259,6 +438,60 @@ define void @check_creation_order(ptr %a, ptr %b, i32 %m) {
; CHECK: exit:
; CHECK-NEXT: ret void
;
+; CHECK-LOOP-DEP-LABEL: @check_creation_order(
+; CHECK-LOOP-DEP-NEXT: entry:
+; CHECK-LOOP-DEP-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A:%.*]] to i64
+; CHECK-LOOP-DEP-NEXT: [[A2:%.*]] = ptrtoaddr ptr [[A3:%.*]] to i64
+; CHECK-LOOP-DEP-NEXT: [[M_EXT:%.*]] = sext i32 [[M:%.*]] to i64
+; CHECK-LOOP-DEP-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr double, ptr [[A3]], i64 [[M_EXT]]
+; CHECK-LOOP-DEP-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-LOOP-DEP: vector.memcheck:
+; CHECK-LOOP-DEP-NEXT: [[TMP1:%.*]] = mul nsw i64 [[M_EXT]], -8
+; CHECK-LOOP-DEP-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-LOOP-DEP-NEXT: [[LOOP_DEP_MASK:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr null, ptr [[TMP2]], i64 8)
+; CHECK-LOOP-DEP-NEXT: [[NO_CONFLICT:%.*]] = extractelement <4 x i1> [[LOOP_DEP_MASK]], i64 3
+; CHECK-LOOP-DEP-NEXT: [[IS_CONFLICT:%.*]] = xor i1 [[NO_CONFLICT]], true
+; CHECK-LOOP-DEP-NEXT: [[TMP8:%.*]] = sub i64 [[A2]], [[A1]]
+; CHECK-LOOP-DEP-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-LOOP-DEP-NEXT: [[LOOP_DEP_MASK3:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr null, ptr [[TMP9]], i64 8)
+; CHECK-LOOP-DEP-NEXT: [[NO_CONFLICT4:%.*]] = extractelement <4 x i1> [[LOOP_DEP_MASK3]], i64 3
+; CHECK-LOOP-DEP-NEXT: [[IS_CONFLICT5:%.*]] = xor i1 [[NO_CONFLICT4]], true
+; CHECK-LOOP-DEP-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[IS_CONFLICT]], [[IS_CONFLICT5]]
+; CHECK-LOOP-DEP-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-LOOP-DEP: vector.ph:
+; CHECK-LOOP-DEP-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-LOOP-DEP: vector.body:
+; CHECK-LOOP-DEP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-LOOP-DEP-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[INVARIANT_GEP]], i64 [[INDEX]]
+; CHECK-LOOP-DEP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP3]], align 8
+; CHECK-LOOP-DEP-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]]
+; CHECK-LOOP-DEP-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP4]], align 8
+; CHECK-LOOP-DEP-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[WIDE_LOAD]], [[WIDE_LOAD6]]
+; CHECK-LOOP-DEP-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[A3]], i64 [[INDEX]]
+; CHECK-LOOP-DEP-NEXT: store <4 x double> [[TMP5]], ptr [[TMP6]], align 8
+; CHECK-LOOP-DEP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-LOOP-DEP-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 31996
+; CHECK-LOOP-DEP-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-LOOP-DEP: middle.block:
+; CHECK-LOOP-DEP-NEXT: br label [[SCALAR_PH]]
+; CHECK-LOOP-DEP: scalar.ph:
+; CHECK-LOOP-DEP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 31996, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-LOOP-DEP-NEXT: br label [[LOOP:%.*]]
+; CHECK-LOOP-DEP: loop:
+; CHECK-LOOP-DEP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-LOOP-DEP-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[INVARIANT_GEP]], i64 [[IV]]
+; CHECK-LOOP-DEP-NEXT: [[L_0:%.*]] = load double, ptr [[GEP]], align 8
+; CHECK-LOOP-DEP-NEXT: [[GEP_B:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]]
+; CHECK-LOOP-DEP-NEXT: [[L_1:%.*]] = load double, ptr [[GEP_B]], align 8
+; CHECK-LOOP-DEP-NEXT: [[ADD3:%.*]] = fadd double [[L_0]], [[L_1]]
+; CHECK-LOOP-DEP-NEXT: [[GEP_A:%.*]] = getelementptr inbounds double, ptr [[A3]], i64 [[IV]]
+; CHECK-LOOP-DEP-NEXT: store double [[ADD3]], ptr [[GEP_A]], align 8
+; CHECK-LOOP-DEP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-LOOP-DEP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 31999
+; CHECK-LOOP-DEP-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-LOOP-DEP: exit:
+; CHECK-LOOP-DEP-NEXT: ret void
+;
entry:
%m.ext = sext i32 %m to i64
%invariant.gep = getelementptr double, ptr %a, i64 %m.ext
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index 645c2742095bb..b25f3767b4a82 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -24,11 +24,11 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
; CHECK-VF4UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
; CHECK-VF4UF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK-VF4UF1: [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF1-NEXT: [[TMP8:%.*]] = add i64 [[B1]], -4
+; CHECK-VF4UF1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[A2]]
; CHECK-VF4UF1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF1-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
; CHECK-VF4UF1-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; CHECK-VF4UF1-NEXT: [[TMP8:%.*]] = add i64 [[B1]], -4
-; CHECK-VF4UF1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[A2]]
; CHECK-VF4UF1-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP9]], [[TMP7]]
; CHECK-VF4UF1-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK-VF4UF1: [[VECTOR_PH]]:
@@ -87,11 +87,11 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
; CHECK-VF4UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
; CHECK-VF4UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK-VF4UF2: [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF2-NEXT: [[TMP8:%.*]] = add i64 [[B1]], -4
+; CHECK-VF4UF2-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[A2]]
; CHECK-VF4UF2-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF2-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
; CHECK-VF4UF2-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
-; CHECK-VF4UF2-NEXT: [[TMP8:%.*]] = add i64 [[B1]], -4
-; CHECK-VF4UF2-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[A2]]
; CHECK-VF4UF2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP9]], [[TMP7]]
; CHECK-VF4UF2-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK-VF4UF2: [[VECTOR_PH]]: