[llvm] [LoopVectorizer] Allow partial reductions to be made in predicated loops (PR #124268)
James Chesterman via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 31 06:22:35 PST 2025
https://github.com/JamesChesterman updated https://github.com/llvm/llvm-project/pull/124268
>From e5c27907e0b864bb093e9f16d4e33e3b98eafde4 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Fri, 24 Jan 2025 13:12:09 +0000
Subject: [PATCH 1/5] [LoopVectorizer] Allow partial reductions to be made in
predicated loops
Apply the select to the input of the partial reduction rather than to its
output. This way the mask has the same number of lanes as the other
operands of the select instruction.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 17 ++++------
llvm/lib/Transforms/Vectorize/VPlan.h | 23 ++++++++++---
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 6 ++++
.../partial-reduce-dot-product-neon.ll | 24 +++++++-------
.../AArch64/partial-reduce-dot-product.ll | 32 +++++++++----------
5 files changed, 58 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5375d2be9c8751..5b61d9fa7d3aea 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8734,13 +8734,6 @@ bool VPRecipeBuilder::getScaledReductions(
if (!CM.TheLoop->contains(RdxExitInstr))
return false;
- // TODO: Allow scaling reductions when predicating. The select at
- // the end of the loop chooses between the phi value and most recent
- // reduction result, both of which have different VFs to the active lane
- // mask when scaling.
- if (CM.blockNeedsPredicationForAnyReason(RdxExitInstr->getParent()))
- return false;
-
auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
if (!Update)
return false;
@@ -8904,8 +8897,9 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
isa<VPPartialReductionRecipe>(BinOpRecipe))
std::swap(BinOp, Accumulator);
- return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp,
- Accumulator, Reduction);
+ VPValue *Mask = getBlockInMask(Reduction->getParent());
+
+ return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Accumulator, Mask, Reduction);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9726,8 +9720,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
PhiTy->isFloatingPointTy()
? std::make_optional(RdxDesc.getFastMathFlags())
: std::nullopt;
- NewExitingVPV =
- Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
+ if (!isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe()))
+ NewExitingVPV =
+ Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
return isa<VPInstruction>(&U) &&
cast<VPInstruction>(&U)->getOpcode() ==
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a1ff684b2b8017..3056d7b16db09c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2448,16 +2448,18 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// A recipe for forming partial reductions. In the loop, an accumulator and
/// vector operand are added together and passed to the next iteration as the
/// next accumulator. After the loop body, the accumulator is reduced to a
-/// scalar value.
+/// scalar value. If the mask operand is not nullptr then it is applied to the
+/// vector operand on each iteration.
class VPPartialReductionRecipe : public VPSingleDefRecipe {
unsigned Opcode;
public:
VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
- VPValue *Op1)
- : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
+ VPValue *Op1, VPValue *Mask = nullptr)
+ : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, Mask,
ReductionInst) {}
VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
+ VPValue *Mask = nullptr,
Instruction *ReductionInst = nullptr)
: VPSingleDefRecipe(VPDef::VPPartialReductionSC,
ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
@@ -2467,12 +2469,23 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
"Unexpected operand order for partial reduction recipe");
+ if (Mask)
+ addOperand(Mask);
}
~VPPartialReductionRecipe() override = default;
VPPartialReductionRecipe *clone() override {
- return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
- getUnderlyingInstr());
+ return getNumOperands() == 3
+ ? new VPPartialReductionRecipe(Opcode, getOperand(0),
+ getOperand(1), getOperand(2),
+ getUnderlyingInstr())
+ : new VPPartialReductionRecipe(Opcode, getOperand(0),
+ getOperand(1), nullptr,
+ getUnderlyingInstr());
+ }
+
+ VPValue *getMask() const {
+ return getNumOperands() == 3 ? getOperand(2) : nullptr;
}
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2679ed6b26b5d1..8de6dd29f5d8d0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -327,6 +327,12 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
Type *RetTy = PhiVal->getType();
+ VPValue *Mask = getMask();
+ if (Mask) {
+ Value *MaskVal = State.get(Mask);
+ Value *Zero = ConstantInt::get(BinOpVal->getType(), 0);
+ BinOpVal = Builder.CreateSelect(MaskVal, BinOpVal, Zero);
+ }
CallInst *V = Builder.CreateIntrinsic(
RetTy, Intrinsic::experimental_vector_partial_reduce_add,
{PhiVal, BinOpVal}, nullptr, "partial.reduce");
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index 97a5801d88108a..cc9f00374e2c89 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -1596,7 +1596,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -1905,14 +1905,14 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP180]] = add <16 x i32> [[TMP179]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP181:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP180]], <16 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
; CHECK-INTERLEAVE1-NEXT: [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP181]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -1951,7 +1951,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -2260,14 +2260,14 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
-; CHECK-INTERLEAVED-NEXT: [[TMP180]] = add <16 x i32> [[TMP179]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP181:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP180]], <16 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
; CHECK-INTERLEAVED-NEXT: [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP181]])
+; CHECK-INTERLEAVED-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -2306,7 +2306,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
; CHECK-MAXBW-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -2615,14 +2615,14 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
; CHECK-MAXBW-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
-; CHECK-MAXBW-NEXT: [[TMP180]] = add <16 x i32> [[TMP179]], [[VEC_PHI]]
-; CHECK-MAXBW-NEXT: [[TMP181:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP180]], <16 x i32> [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-MAXBW-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
; CHECK-MAXBW-NEXT: [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP181]])
+; CHECK-MAXBW-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index a0214ae88c2d6e..a2a2d231c81d11 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1975,41 +1975,41 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16
; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
-; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
-; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
-; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
-; CHECK-MAXBW-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP15]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP16]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP17]], <vscale x 16 x i32> zeroinitializer
+; CHECK-MAXBW-NEXT: [[TMP19]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP18]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
-; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
-; CHECK-MAXBW-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = extractelement <vscale x 16 x i1> [[TMP21]], i32 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
>From b62c3fef6c46d5d2bb49f2b18cacc52c0e55312e Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Fri, 24 Jan 2025 14:11:31 +0000
Subject: [PATCH 2/5] Move condition that checks whether current reduction is
a partial reduction.
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5b61d9fa7d3aea..6b57ba9eaa85ec 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9711,7 +9711,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// beginning of the dedicated latch block.
auto *OrigExitingVPV = PhiR->getBackedgeValue();
auto *NewExitingVPV = PhiR->getBackedgeValue();
- if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
+ if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
+ !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
"reduction recipe must be defined before latch");
@@ -9720,9 +9721,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
PhiTy->isFloatingPointTy()
? std::make_optional(RdxDesc.getFastMathFlags())
: std::nullopt;
- if (!isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe()))
- NewExitingVPV =
- Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
+ NewExitingVPV =
+ Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
return isa<VPInstruction>(&U) &&
cast<VPInstruction>(&U)->getOpcode() ==
>From 0e0f8a1f7975422b452493d6878c3c8cac4ec335 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Fri, 24 Jan 2025 16:37:57 +0000
Subject: [PATCH 3/5] Address comments, such as removing and renaming tests
---
.../Transforms/Vectorize/LoopVectorize.cpp | 5 +
llvm/lib/Transforms/Vectorize/VPlan.h | 9 +-
.../partial-reduce-dot-product-neon.ll | 497 ++++++------------
.../AArch64/partial-reduce-dot-product.ll | 449 ++++------------
4 files changed, 274 insertions(+), 686 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6b57ba9eaa85ec..dd930c3a5a3b49 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9711,6 +9711,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// beginning of the dedicated latch block.
auto *OrigExitingVPV = PhiR->getBackedgeValue();
auto *NewExitingVPV = PhiR->getBackedgeValue();
+ // Don't add selects here for partial reductions because the phi and partial
+ // reduction values have less vector elements than Cond. But, each operand
+ // in a select instruction needs to have the same number of vector elements,
+ // so the compiler would crash. Instead, a select, with the active lane
+ // mask, is applied to the inputs to the partial reduction.
if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
!isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 3056d7b16db09c..4f074beff1b249 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2475,13 +2475,8 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
~VPPartialReductionRecipe() override = default;
VPPartialReductionRecipe *clone() override {
- return getNumOperands() == 3
- ? new VPPartialReductionRecipe(Opcode, getOperand(0),
- getOperand(1), getOperand(2),
- getUnderlyingInstr())
- : new VPPartialReductionRecipe(Opcode, getOperand(0),
- getOperand(1), nullptr,
- getUnderlyingInstr());
+ return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
+ getMask(), getUnderlyingInstr());
}
VPValue *getMask() const {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index cc9f00374e2c89..54dc6ce5d3a92f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -1391,197 +1391,8 @@ exit: ; preds = %for.body
ret i32 %result
}
-define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) {
-; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated(
-; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVE1-NEXT: entry:
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
-; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVE1: vector.body:
-; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-INTERLEAVE1: scalar.ph:
-; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-INTERLEAVE1: for.body:
-; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
-; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
-; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
-; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
-; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
-; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
-; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-INTERLEAVE1: exit:
-; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
-; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]]
-;
-; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated(
-; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
-; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVED: vector.body:
-; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP4]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP10]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-INTERLEAVED: scalar.ph:
-; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-INTERLEAVED: for.body:
-; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
-; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
-; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
-; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
-; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
-; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
-; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-INTERLEAVED: exit:
-; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]]
-;
-; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated(
-; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-MAXBW-NEXT: entry:
-; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
-; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-MAXBW: vector.body:
-; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-MAXBW: scalar.ph:
-; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-MAXBW: for.body:
-; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
-; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
-; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
-; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
-; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
-; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
-; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-MAXBW: exit:
-; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
-; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
- %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
- %load.a = load i8, ptr %gep.a, align 1
- %ext.a = sext i8 %load.a to i32
- %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
- %load.b = load i8, ptr %gep.b, align 1
- %ext.b = sext i8 %load.b to i32
- %mul = mul nsw i32 %ext.b, %ext.a
- %add = add nsw i32 %mul, %accum
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond.not = icmp eq i64 %iv.next, %N
- br i1 %exitcond.not, label %exit, label %for.body
-
-exit: ; preds = %for.body
- ret i32 %add
-}
-
-define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
-; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma(
+define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_predicated(
; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -1617,7 +1428,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK-INTERLEAVE1: pred.load.if:
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE]]
@@ -1626,7 +1437,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
; CHECK-INTERLEAVE1: pred.load.if1:
-; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE2]]
@@ -1635,7 +1446,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
; CHECK-INTERLEAVE1: pred.load.if3:
-; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE4]]
@@ -1644,7 +1455,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
; CHECK-INTERLEAVE1: pred.load.if5:
-; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE6]]
@@ -1653,7 +1464,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
; CHECK-INTERLEAVE1: pred.load.if7:
-; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE8]]
@@ -1662,7 +1473,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
; CHECK-INTERLEAVE1: pred.load.if9:
-; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE10]]
@@ -1671,7 +1482,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
; CHECK-INTERLEAVE1: pred.load.if11:
-; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE12]]
@@ -1680,7 +1491,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
; CHECK-INTERLEAVE1: pred.load.if13:
-; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE14]]
@@ -1689,7 +1500,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
; CHECK-INTERLEAVE1: pred.load.if15:
-; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE16]]
@@ -1698,7 +1509,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
; CHECK-INTERLEAVE1: pred.load.if17:
-; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE18]]
@@ -1707,7 +1518,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
; CHECK-INTERLEAVE1: pred.load.if19:
-; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
; CHECK-INTERLEAVE1-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE20]]
@@ -1716,7 +1527,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
; CHECK-INTERLEAVE1: pred.load.if21:
-; CHECK-INTERLEAVE1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
; CHECK-INTERLEAVE1-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE22]]
@@ -1725,7 +1536,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
; CHECK-INTERLEAVE1: pred.load.if23:
-; CHECK-INTERLEAVE1-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
; CHECK-INTERLEAVE1-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE24]]
@@ -1734,7 +1545,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
; CHECK-INTERLEAVE1: pred.load.if25:
-; CHECK-INTERLEAVE1-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
; CHECK-INTERLEAVE1-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE26]]
@@ -1743,7 +1554,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
; CHECK-INTERLEAVE1: pred.load.if27:
-; CHECK-INTERLEAVE1-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
; CHECK-INTERLEAVE1-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE28]]
@@ -1752,7 +1563,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
; CHECK-INTERLEAVE1: pred.load.if29:
-; CHECK-INTERLEAVE1-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE30]]
@@ -1762,7 +1573,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
; CHECK-INTERLEAVE1: pred.load.if31:
-; CHECK-INTERLEAVE1-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
; CHECK-INTERLEAVE1-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE32]]
@@ -1771,7 +1582,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
; CHECK-INTERLEAVE1: pred.load.if33:
-; CHECK-INTERLEAVE1-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
; CHECK-INTERLEAVE1-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE34]]
@@ -1780,7 +1591,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
; CHECK-INTERLEAVE1: pred.load.if35:
-; CHECK-INTERLEAVE1-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
; CHECK-INTERLEAVE1-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE36]]
@@ -1789,7 +1600,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
; CHECK-INTERLEAVE1: pred.load.if37:
-; CHECK-INTERLEAVE1-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE38]]
@@ -1798,7 +1609,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
; CHECK-INTERLEAVE1: pred.load.if39:
-; CHECK-INTERLEAVE1-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
; CHECK-INTERLEAVE1-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE40]]
@@ -1807,7 +1618,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
; CHECK-INTERLEAVE1: pred.load.if41:
-; CHECK-INTERLEAVE1-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
; CHECK-INTERLEAVE1-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE42]]
@@ -1816,7 +1627,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
; CHECK-INTERLEAVE1: pred.load.if43:
-; CHECK-INTERLEAVE1-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
; CHECK-INTERLEAVE1-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE44]]
@@ -1825,7 +1636,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
; CHECK-INTERLEAVE1: pred.load.if45:
-; CHECK-INTERLEAVE1-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
; CHECK-INTERLEAVE1-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE46]]
@@ -1834,7 +1645,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
; CHECK-INTERLEAVE1: pred.load.if47:
-; CHECK-INTERLEAVE1-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
; CHECK-INTERLEAVE1-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE48]]
@@ -1843,7 +1654,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
; CHECK-INTERLEAVE1: pred.load.if49:
-; CHECK-INTERLEAVE1-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE50]]
@@ -1852,7 +1663,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
; CHECK-INTERLEAVE1: pred.load.if51:
-; CHECK-INTERLEAVE1-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
; CHECK-INTERLEAVE1-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE52]]
@@ -1861,7 +1672,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
; CHECK-INTERLEAVE1: pred.load.if53:
-; CHECK-INTERLEAVE1-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
; CHECK-INTERLEAVE1-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE54]]
@@ -1870,7 +1681,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
; CHECK-INTERLEAVE1: pred.load.if55:
-; CHECK-INTERLEAVE1-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
; CHECK-INTERLEAVE1-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE56]]
@@ -1879,7 +1690,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
; CHECK-INTERLEAVE1: pred.load.if57:
-; CHECK-INTERLEAVE1-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
; CHECK-INTERLEAVE1-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE58]]
@@ -1888,7 +1699,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
; CHECK-INTERLEAVE1: pred.load.if59:
-; CHECK-INTERLEAVE1-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
; CHECK-INTERLEAVE1-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE60]]
@@ -1897,7 +1708,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
; CHECK-INTERLEAVE1: pred.load.if61:
-; CHECK-INTERLEAVE1-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE62]]
@@ -1909,34 +1720,34 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
-; CHECK-INTERLEAVE1-NEXT: [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP183]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP182]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-INTERLEAVE1: for.body:
; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
-; CHECK-INTERLEAVE1-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
-; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVE1: exit:
-; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP183]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP182]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]]
;
-; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp_predicated(
; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -1972,7 +1783,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK-INTERLEAVED: pred.load.if:
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE]]
@@ -1981,7 +1792,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
; CHECK-INTERLEAVED: pred.load.if1:
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE2]]
@@ -1990,7 +1801,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
; CHECK-INTERLEAVED: pred.load.if3:
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE4]]
@@ -1999,7 +1810,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
; CHECK-INTERLEAVED: pred.load.if5:
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE6]]
@@ -2008,7 +1819,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
; CHECK-INTERLEAVED: pred.load.if7:
-; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE8]]
@@ -2017,7 +1828,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
; CHECK-INTERLEAVED: pred.load.if9:
-; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE10]]
@@ -2026,7 +1837,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
; CHECK-INTERLEAVED: pred.load.if11:
-; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE12]]
@@ -2035,7 +1846,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
; CHECK-INTERLEAVED: pred.load.if13:
-; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE14]]
@@ -2044,7 +1855,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
; CHECK-INTERLEAVED: pred.load.if15:
-; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE16]]
@@ -2053,7 +1864,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
; CHECK-INTERLEAVED: pred.load.if17:
-; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE18]]
@@ -2062,7 +1873,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
; CHECK-INTERLEAVED: pred.load.if19:
-; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE20]]
@@ -2071,7 +1882,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
; CHECK-INTERLEAVED: pred.load.if21:
-; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE22]]
@@ -2080,7 +1891,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
; CHECK-INTERLEAVED: pred.load.if23:
-; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE24]]
@@ -2089,7 +1900,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
; CHECK-INTERLEAVED: pred.load.if25:
-; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE26]]
@@ -2098,7 +1909,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
; CHECK-INTERLEAVED: pred.load.if27:
-; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE28]]
@@ -2107,7 +1918,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
; CHECK-INTERLEAVED: pred.load.if29:
-; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE30]]
@@ -2117,7 +1928,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
; CHECK-INTERLEAVED: pred.load.if31:
-; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE32]]
@@ -2126,7 +1937,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
; CHECK-INTERLEAVED: pred.load.if33:
-; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE34]]
@@ -2135,7 +1946,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
; CHECK-INTERLEAVED: pred.load.if35:
-; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE36]]
@@ -2144,7 +1955,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
; CHECK-INTERLEAVED: pred.load.if37:
-; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE38]]
@@ -2153,7 +1964,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
; CHECK-INTERLEAVED: pred.load.if39:
-; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE40]]
@@ -2162,7 +1973,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
; CHECK-INTERLEAVED: pred.load.if41:
-; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE42]]
@@ -2171,7 +1982,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
; CHECK-INTERLEAVED: pred.load.if43:
-; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE44]]
@@ -2180,7 +1991,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
; CHECK-INTERLEAVED: pred.load.if45:
-; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE46]]
@@ -2189,7 +2000,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
; CHECK-INTERLEAVED: pred.load.if47:
-; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE48]]
@@ -2198,7 +2009,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
; CHECK-INTERLEAVED: pred.load.if49:
-; CHECK-INTERLEAVED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE50]]
@@ -2207,7 +2018,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
; CHECK-INTERLEAVED: pred.load.if51:
-; CHECK-INTERLEAVED-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
; CHECK-INTERLEAVED-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE52]]
@@ -2216,7 +2027,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
; CHECK-INTERLEAVED: pred.load.if53:
-; CHECK-INTERLEAVED-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
; CHECK-INTERLEAVED-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE54]]
@@ -2225,7 +2036,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
; CHECK-INTERLEAVED: pred.load.if55:
-; CHECK-INTERLEAVED-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
; CHECK-INTERLEAVED-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE56]]
@@ -2234,7 +2045,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
; CHECK-INTERLEAVED: pred.load.if57:
-; CHECK-INTERLEAVED-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
; CHECK-INTERLEAVED-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE58]]
@@ -2243,7 +2054,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
; CHECK-INTERLEAVED: pred.load.if59:
-; CHECK-INTERLEAVED-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
; CHECK-INTERLEAVED-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE60]]
@@ -2252,7 +2063,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
; CHECK-INTERLEAVED: pred.load.if61:
-; CHECK-INTERLEAVED-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
; CHECK-INTERLEAVED-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE62]]
@@ -2264,34 +2075,34 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
-; CHECK-INTERLEAVED-NEXT: [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP183]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP182]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-INTERLEAVED: for.body:
; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
-; CHECK-INTERLEAVED-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
-; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVED: exit:
-; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP183]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP182]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]]
;
-; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-MAXBW-LABEL: define i32 @dotp_predicated(
; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -2327,7 +2138,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
; CHECK-MAXBW-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK-MAXBW: pred.load.if:
-; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE]]
@@ -2336,7 +2147,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
; CHECK-MAXBW: pred.load.if1:
-; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE2]]
@@ -2345,7 +2156,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
; CHECK-MAXBW-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
; CHECK-MAXBW: pred.load.if3:
-; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE4]]
@@ -2354,7 +2165,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
; CHECK-MAXBW: pred.load.if5:
-; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE6]]
@@ -2363,7 +2174,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
; CHECK-MAXBW-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
; CHECK-MAXBW: pred.load.if7:
-; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE8]]
@@ -2372,7 +2183,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
; CHECK-MAXBW-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
; CHECK-MAXBW: pred.load.if9:
-; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE10]]
@@ -2381,7 +2192,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
; CHECK-MAXBW-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
; CHECK-MAXBW: pred.load.if11:
-; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE12]]
@@ -2390,7 +2201,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
; CHECK-MAXBW-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
; CHECK-MAXBW: pred.load.if13:
-; CHECK-MAXBW-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-MAXBW-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
; CHECK-MAXBW-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
; CHECK-MAXBW-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE14]]
@@ -2399,7 +2210,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
; CHECK-MAXBW-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
; CHECK-MAXBW: pred.load.if15:
-; CHECK-MAXBW-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-MAXBW-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE16]]
@@ -2408,7 +2219,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
; CHECK-MAXBW-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
; CHECK-MAXBW: pred.load.if17:
-; CHECK-MAXBW-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-MAXBW-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE18]]
@@ -2417,7 +2228,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
; CHECK-MAXBW-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
; CHECK-MAXBW: pred.load.if19:
-; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
; CHECK-MAXBW-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE20]]
@@ -2426,7 +2237,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
; CHECK-MAXBW-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
; CHECK-MAXBW: pred.load.if21:
-; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
; CHECK-MAXBW-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE22]]
@@ -2435,7 +2246,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
; CHECK-MAXBW-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
; CHECK-MAXBW: pred.load.if23:
-; CHECK-MAXBW-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
; CHECK-MAXBW-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
; CHECK-MAXBW-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE24]]
@@ -2444,7 +2255,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
; CHECK-MAXBW-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
; CHECK-MAXBW: pred.load.if25:
-; CHECK-MAXBW-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
; CHECK-MAXBW-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
; CHECK-MAXBW-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE26]]
@@ -2453,7 +2264,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
; CHECK-MAXBW-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
; CHECK-MAXBW: pred.load.if27:
-; CHECK-MAXBW-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
; CHECK-MAXBW-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
; CHECK-MAXBW-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE28]]
@@ -2462,7 +2273,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
; CHECK-MAXBW-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
; CHECK-MAXBW: pred.load.if29:
-; CHECK-MAXBW-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-MAXBW-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
; CHECK-MAXBW-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
; CHECK-MAXBW-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE30]]
@@ -2472,7 +2283,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
; CHECK-MAXBW-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
; CHECK-MAXBW: pred.load.if31:
-; CHECK-MAXBW-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
; CHECK-MAXBW-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE32]]
@@ -2481,7 +2292,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
; CHECK-MAXBW-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
; CHECK-MAXBW: pred.load.if33:
-; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE34]]
@@ -2490,7 +2301,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
; CHECK-MAXBW-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
; CHECK-MAXBW: pred.load.if35:
-; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE36]]
@@ -2499,7 +2310,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
; CHECK-MAXBW-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
; CHECK-MAXBW: pred.load.if37:
-; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE38]]
@@ -2508,7 +2319,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
; CHECK-MAXBW-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
; CHECK-MAXBW: pred.load.if39:
-; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE40]]
@@ -2517,7 +2328,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
; CHECK-MAXBW-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
; CHECK-MAXBW: pred.load.if41:
-; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE42]]
@@ -2526,7 +2337,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
; CHECK-MAXBW-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
; CHECK-MAXBW: pred.load.if43:
-; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE44]]
@@ -2535,7 +2346,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
; CHECK-MAXBW-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
; CHECK-MAXBW: pred.load.if45:
-; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
+; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
; CHECK-MAXBW-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE46]]
@@ -2544,7 +2355,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
; CHECK-MAXBW-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
; CHECK-MAXBW: pred.load.if47:
-; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
+; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
; CHECK-MAXBW-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
; CHECK-MAXBW-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE48]]
@@ -2553,7 +2364,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
; CHECK-MAXBW-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
; CHECK-MAXBW: pred.load.if49:
-; CHECK-MAXBW-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-MAXBW-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
; CHECK-MAXBW-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE50]]
@@ -2562,7 +2373,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
; CHECK-MAXBW-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
; CHECK-MAXBW: pred.load.if51:
-; CHECK-MAXBW-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
; CHECK-MAXBW-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
; CHECK-MAXBW-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE52]]
@@ -2571,7 +2382,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
; CHECK-MAXBW-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
; CHECK-MAXBW: pred.load.if53:
-; CHECK-MAXBW-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
; CHECK-MAXBW-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
; CHECK-MAXBW-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE54]]
@@ -2580,7 +2391,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
; CHECK-MAXBW-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
; CHECK-MAXBW: pred.load.if55:
-; CHECK-MAXBW-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
; CHECK-MAXBW-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
; CHECK-MAXBW-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE56]]
@@ -2589,7 +2400,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
; CHECK-MAXBW-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
; CHECK-MAXBW: pred.load.if57:
-; CHECK-MAXBW-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
; CHECK-MAXBW-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
; CHECK-MAXBW-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE58]]
@@ -2598,7 +2409,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
; CHECK-MAXBW-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
; CHECK-MAXBW: pred.load.if59:
-; CHECK-MAXBW-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
; CHECK-MAXBW-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
; CHECK-MAXBW-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE60]]
@@ -2607,7 +2418,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
; CHECK-MAXBW-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
; CHECK-MAXBW: pred.load.if61:
-; CHECK-MAXBW-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
+; CHECK-MAXBW-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
; CHECK-MAXBW-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
; CHECK-MAXBW-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE62]]
@@ -2619,31 +2430,31 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-MAXBW-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
-; CHECK-MAXBW-NEXT: [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-MAXBW-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP183]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP182]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-MAXBW: for.body:
; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
-; CHECK-MAXBW-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
-; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-MAXBW: exit:
-; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP183]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP182]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
;
entry:
@@ -2652,11 +2463,11 @@ entry:
for.body: ; preds = %entry, %for.body
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
- %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv
+ %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
%load.a = load i8, ptr %gep.a, align 1
%ext.a = sext i8 %load.a to i32
- %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv
- %load.b = load i8, ptr %gep.a2, align 1
+ %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
%ext.b = sext i8 %load.b to i32
%mul = mul nsw i32 %ext.b, %ext.a
%add = add nsw i32 %mul, %accum
@@ -2691,7 +2502,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
@@ -2713,7 +2524,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVE1: for.exit:
; CHECK-INTERLEAVE1-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
@@ -2751,7 +2562,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]]
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
@@ -2774,7 +2585,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVED: for.exit:
; CHECK-INTERLEAVED-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
@@ -2803,7 +2614,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
@@ -2825,7 +2636,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-MAXBW: for.exit:
; CHECK-MAXBW-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
@@ -2872,13 +2683,11 @@ for.exit: ; preds = %for.body
; CHECK-INTERLEAVE1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
; CHECK-INTERLEAVE1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
; CHECK-INTERLEAVE1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[META15]] = !{!"llvm.loop.mustprogress"}
-; CHECK-INTERLEAVE1: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
+; CHECK-INTERLEAVE1: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVE1: [[META13]] = !{!"llvm.loop.mustprogress"}
+; CHECK-INTERLEAVE1: [[LOOP14]] = distinct !{[[LOOP14]], [[META13]], [[META2]], [[META1]]}
+; CHECK-INTERLEAVE1: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVE1: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]}
;.
; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -2892,13 +2701,11 @@ for.exit: ; preds = %for.body
; CHECK-INTERLEAVED: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
; CHECK-INTERLEAVED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
; CHECK-INTERLEAVED: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[META15]] = !{!"llvm.loop.mustprogress"}
-; CHECK-INTERLEAVED: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
+; CHECK-INTERLEAVED: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVED: [[META13]] = !{!"llvm.loop.mustprogress"}
+; CHECK-INTERLEAVED: [[LOOP14]] = distinct !{[[LOOP14]], [[META13]], [[META2]], [[META1]]}
+; CHECK-INTERLEAVED: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVED: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]}
;.
; CHECK-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -2912,11 +2719,9 @@ for.exit: ; preds = %for.body
; CHECK-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
; CHECK-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
; CHECK-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[META15]] = !{!"llvm.loop.mustprogress"}
-; CHECK-MAXBW: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
+; CHECK-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META1]], [[META2]]}
+; CHECK-MAXBW: [[META13]] = !{!"llvm.loop.mustprogress"}
+; CHECK-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META13]], [[META2]], [[META1]]}
+; CHECK-MAXBW: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
+; CHECK-MAXBW: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index a2a2d231c81d11..32833231d98bc5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1625,259 +1625,48 @@ exit: ; preds = %for.body
ret i32 %result
}
-define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
-; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated(
-; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVE1-NEXT: entry:
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP7]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul i64 [[TMP11]], 4
-; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVE1: vector.body:
-; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <vscale x 4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
-; CHECK-INTERLEAVE1: scalar.ph:
-; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-INTERLEAVE1: for.body:
-; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
-; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
-; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
-; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
-; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
-; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
-; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-INTERLEAVE1: exit:
-; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]]
-;
-; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated(
-; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP8]], 8
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP15]], 8
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVED: vector.body:
-; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <vscale x 4 x i32> [[TMP19]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw <vscale x 4 x i32> [[TMP20]], [[TMP25]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP21]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
-; CHECK-INTERLEAVED: scalar.ph:
-; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP26]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-INTERLEAVED: for.body:
-; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
-; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
-; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
-; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
-; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
-; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
-; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-INTERLEAVED: exit:
-; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]]
-;
-; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated(
-; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-MAXBW-NEXT: entry:
-; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-MAXBW: vector.body:
-; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
-; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw <vscale x 8 x i32> [[TMP20]], [[TMP13]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]])
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE5]])
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-MAXBW: scalar.ph:
-; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-MAXBW: for.body:
-; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
-; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
-; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
-; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
-; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
-; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
-; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-MAXBW: exit:
-; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
- %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
- %load.a = load i8, ptr %gep.a, align 1
- %ext.a = sext i8 %load.a to i32
- %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
- %load.b = load i8, ptr %gep.b, align 1
- %ext.b = sext i8 %load.b to i32
- %mul = mul nsw i32 %ext.b, %ext.a
- %add = add nsw i32 %mul, %accum
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond.not = icmp eq i64 %iv.next, %N
- br i1 %exitcond.not, label %exit, label %for.body
-
-exit: ; preds = %for.body
- ret i32 %add
-}
-
-define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
-; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma(
+define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_predicated(
; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
-; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -1888,62 +1677,62 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1: for.body:
; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
-; CHECK-INTERLEAVE1-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
-; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVE1: exit:
; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]]
;
-; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp_predicated(
; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
-; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
; CHECK-INTERLEAVED-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -1954,22 +1743,22 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED: for.body:
; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
-; CHECK-INTERLEAVED-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
-; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVED: exit:
; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]]
;
-; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-MAXBW-LABEL: define i32 @dotp_predicated(
; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -1992,47 +1781,47 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP15]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32>
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP16]], [[TMP13]]
; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP17]], <vscale x 16 x i32> zeroinitializer
-; CHECK-MAXBW-NEXT: [[TMP19]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP18]])
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP18]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = extractelement <vscale x 16 x i1> [[TMP21]], i32 0
-; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = extractelement <vscale x 16 x i1> [[TMP19]], i32 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-MAXBW: for.body:
; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
-; CHECK-MAXBW-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
-; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-MAXBW: exit:
-; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
;
entry:
@@ -2041,11 +1830,11 @@ entry:
for.body: ; preds = %entry, %for.body
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
- %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv
+ %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
%load.a = load i8, ptr %gep.a, align 1
%ext.a = sext i8 %load.a to i32
- %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv
- %load.b = load i8, ptr %gep.a2, align 1
+ %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
%ext.b = sext i8 %load.b to i32
%mul = mul nsw i32 %ext.b, %ext.a
%add = add nsw i32 %mul, %accum
@@ -2088,7 +1877,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
@@ -2114,7 +1903,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVE1: for.exit:
; CHECK-INTERLEAVE1-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
@@ -2164,7 +1953,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
@@ -2191,7 +1980,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVED: for.exit:
; CHECK-INTERLEAVED-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
@@ -2228,7 +2017,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]])
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
@@ -2254,7 +2043,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-MAXBW: for.exit:
; CHECK-MAXBW-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
@@ -2317,7 +2106,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP14]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
@@ -2339,7 +2128,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]]
; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[SUM]], [[MUL]]
; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16
-; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-INTERLEAVE1: exit:
; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVE1-NEXT: ret i64 [[ADD_LCSSA]]
@@ -2389,7 +2178,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 2 x i64> [[VEC_PHI1]], [[TMP23]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
@@ -2412,7 +2201,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]]
; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[SUM]], [[MUL]]
; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-INTERLEAVED: exit:
; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVED-NEXT: ret i64 [[ADD_LCSSA]]
@@ -2449,7 +2238,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i64> @llvm.experimental.vector.partial.reduce.add.nxv1i64.nxv8i64(<vscale x 1 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[TMP14]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
@@ -2471,7 +2260,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]]
; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[SUM]], [[MUL]]
; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16
-; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-MAXBW: exit:
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i64 [[ADD_LCSSA]]
@@ -2588,7 +2377,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]]
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
@@ -2615,7 +2404,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16
; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-INTERLEAVED: for.exit:
; CHECK-INTERLEAVED-NEXT: [[ADD_1_LCSSA:%.*]] = phi i32 [ [[ADD_1]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVED-NEXT: [[ADD_FLOAT:%.*]] = sitofp i32 [[ADD_1_LCSSA]] to float
@@ -2732,7 +2521,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2751,7 +2540,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-INTERLEAVE1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-INTERLEAVE1: exit.loopexit:
; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVE1-NEXT: br label [[EXIT]]
@@ -2792,7 +2581,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
@@ -2812,7 +2601,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-INTERLEAVED-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK-INTERLEAVED: exit.loopexit:
; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVED-NEXT: br label [[EXIT]]
@@ -2853,7 +2642,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2872,7 +2661,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-MAXBW-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-MAXBW: exit.loopexit:
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: br label [[EXIT]]
@@ -2933,7 +2722,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2952,7 +2741,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-INTERLEAVE1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK-INTERLEAVE1: exit.loopexit:
; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVE1-NEXT: br label [[EXIT]]
@@ -2993,7 +2782,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
@@ -3013,7 +2802,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-INTERLEAVED-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK-INTERLEAVED: exit.loopexit:
; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; CHECK-INTERLEAVED-NEXT: br label [[EXIT]]
@@ -3054,7 +2843,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -3073,7 +2862,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-MAXBW-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK-MAXBW: exit.loopexit:
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: br label [[EXIT]]
@@ -3124,19 +2913,17 @@ attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
; CHECK-INTERLEAVE1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
; CHECK-INTERLEAVE1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
; CHECK-INTERLEAVE1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[META15]] = !{!"llvm.loop.mustprogress"}
-; CHECK-INTERLEAVE1: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
+; CHECK-INTERLEAVE1: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVE1: [[META13]] = !{!"llvm.loop.mustprogress"}
+; CHECK-INTERLEAVE1: [[LOOP14]] = distinct !{[[LOOP14]], [[META13]], [[META2]], [[META1]]}
+; CHECK-INTERLEAVE1: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVE1: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]}
; CHECK-INTERLEAVE1: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
; CHECK-INTERLEAVE1: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
; CHECK-INTERLEAVE1: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
; CHECK-INTERLEAVE1: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]}
; CHECK-INTERLEAVE1: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
; CHECK-INTERLEAVE1: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
;.
; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -3150,21 +2937,19 @@ attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
; CHECK-INTERLEAVED: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
; CHECK-INTERLEAVED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
; CHECK-INTERLEAVED: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[META15]] = !{!"llvm.loop.mustprogress"}
-; CHECK-INTERLEAVED: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
+; CHECK-INTERLEAVED: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVED: [[META13]] = !{!"llvm.loop.mustprogress"}
+; CHECK-INTERLEAVED: [[LOOP14]] = distinct !{[[LOOP14]], [[META13]], [[META2]], [[META1]]}
+; CHECK-INTERLEAVED: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVED: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]}
; CHECK-INTERLEAVED: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
; CHECK-INTERLEAVED: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
; CHECK-INTERLEAVED: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]}
+; CHECK-INTERLEAVED: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]]}
; CHECK-INTERLEAVED: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]]}
+; CHECK-INTERLEAVED: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]}
; CHECK-INTERLEAVED: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
; CHECK-INTERLEAVED: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP26]] = distinct !{[[LOOP26]], [[META2]], [[META1]]}
;.
; CHECK-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -3178,17 +2963,15 @@ attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
; CHECK-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
; CHECK-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
; CHECK-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[META15]] = !{!"llvm.loop.mustprogress"}
-; CHECK-MAXBW: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
+; CHECK-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META1]], [[META2]]}
+; CHECK-MAXBW: [[META13]] = !{!"llvm.loop.mustprogress"}
+; CHECK-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META13]], [[META2]], [[META1]]}
+; CHECK-MAXBW: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
+; CHECK-MAXBW: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]}
; CHECK-MAXBW: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
; CHECK-MAXBW: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
; CHECK-MAXBW: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
; CHECK-MAXBW: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]}
; CHECK-MAXBW: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
; CHECK-MAXBW: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
;.
>From 3fdd4ef0c5bfc33ce25b9d00121b451dd54bee83 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Wed, 29 Jan 2025 11:53:46 +0000
Subject: [PATCH 4/5] Address comments. Add RUN line to test files. Change a
comment. Remove Mask as operand in Partial Reduction Recipe. Instead just
mask the input when creating the recipe.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 17 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 13 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 13 +-
.../partial-reduce-dot-product-neon.ll | 983 ++++++++++++++
.../AArch64/partial-reduce-dot-product.ll | 1132 +++++++++++++++++
5 files changed, 2135 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index dd930c3a5a3b49..333090821b915a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8898,8 +8898,13 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
std::swap(BinOp, Accumulator);
VPValue *Mask = getBlockInMask(Reduction->getParent());
-
- return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Accumulator, Mask, Reduction);
+ if (Mask) {
+ VPValue *Zero =
+ Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
+ BinOp = Builder.createSelect(Mask, BinOp, Zero, Reduction->getDebugLoc());
+ }
+ return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp,
+ Accumulator, Reduction);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9711,11 +9716,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// beginning of the dedicated latch block.
auto *OrigExitingVPV = PhiR->getBackedgeValue();
auto *NewExitingVPV = PhiR->getBackedgeValue();
- // Don't add selects here for partial reductions because the phi and partial
- // reduction values have less vector elements than Cond. But, each operand
- // in a select instruction needs to have the same number of vector elements,
- // so the compiler would crash. Instead, a select, with the active lane
- // mask, is applied to the inputs to the partial reduction.
+ // Don't output selects for partial reductions because they have an output
+ // with fewer lanes than the VF. So the operands of the select would have
+ // different numbers of lanes. Partial reductions mask the input instead.
if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
!isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4f074beff1b249..1f7100093d7015 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2455,11 +2455,10 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
public:
VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
- VPValue *Op1, VPValue *Mask = nullptr)
- : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, Mask,
+ VPValue *Op1)
+ : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
ReductionInst) {}
VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
- VPValue *Mask = nullptr,
Instruction *ReductionInst = nullptr)
: VPSingleDefRecipe(VPDef::VPPartialReductionSC,
ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
@@ -2469,18 +2468,12 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
"Unexpected operand order for partial reduction recipe");
- if (Mask)
- addOperand(Mask);
}
~VPPartialReductionRecipe() override = default;
VPPartialReductionRecipe *clone() override {
return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
- getMask(), getUnderlyingInstr());
- }
-
- VPValue *getMask() const {
- return getNumOperands() == 3 ? getOperand(2) : nullptr;
+ getUnderlyingInstr());
}
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8de6dd29f5d8d0..d03fba308879e3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -283,6 +283,13 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
std::optional<unsigned> Opcode = std::nullopt;
VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe();
+
+ // If the partial reduction is predicated, a select will be operand 0 rather
+ // than the binary op.
+ using namespace llvm::VPlanPatternMatch;
+ if (match(getOperand(0), m_Select(m_VPValue(), m_VPValue(), m_VPValue())))
+ BinOpR = BinOpR->getOperand(1)->getDefiningRecipe();
+
if (auto *WidenR = dyn_cast<VPWidenRecipe>(BinOpR))
Opcode = std::make_optional(WidenR->getOpcode());
@@ -327,12 +334,6 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
Type *RetTy = PhiVal->getType();
- VPValue *Mask = getMask();
- if (Mask) {
- Value *MaskVal = State.get(Mask);
- Value *Zero = ConstantInt::get(BinOpVal->getType(), 0);
- BinOpVal = Builder.CreateSelect(MaskVal, BinOpVal, Zero);
- }
CallInst *V = Builder.CreateIntrinsic(
RetTy, Intrinsic::experimental_vector_partial_reduce_add,
{PhiVal, BinOpVal}, nullptr, "partial.reduce");
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index 54dc6ce5d3a92f..8360e12b66d05b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -2,6 +2,7 @@
; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=true -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-EPILOGUE
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
@@ -161,6 +162,64 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @dotp(
+; CHECK-EPILOGUE-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-EPILOGUE-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
+; CHECK-EPILOGUE-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]])
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-EPILOGUE-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-EPILOGUE-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-EPILOGUE-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-EPILOGUE: for.exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: ret i32 [[ADD_LCSSA]]
+;
entry:
br label %for.body
@@ -580,6 +639,186 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @not_dotp_different_types(
+; CHECK-EPILOGUE-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18
+; CHECK-EPILOGUE-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20
+; CHECK-EPILOGUE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22
+; CHECK-EPILOGUE-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23
+; CHECK-EPILOGUE-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24
+; CHECK-EPILOGUE-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25
+; CHECK-EPILOGUE-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26
+; CHECK-EPILOGUE-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27
+; CHECK-EPILOGUE-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28
+; CHECK-EPILOGUE-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29
+; CHECK-EPILOGUE-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
+; CHECK-EPILOGUE-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
+; CHECK-EPILOGUE-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-EPILOGUE-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-EPILOGUE-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-EPILOGUE-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-EPILOGUE-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-EPILOGUE-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-EPILOGUE-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-EPILOGUE-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-EPILOGUE-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-EPILOGUE-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-EPILOGUE-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-EPILOGUE-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-EPILOGUE-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-EPILOGUE-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
+; CHECK-EPILOGUE-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-EPILOGUE-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
+; CHECK-EPILOGUE-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-EPILOGUE-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
+; CHECK-EPILOGUE-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]]
+; CHECK-EPILOGUE-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]]
+; CHECK-EPILOGUE-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
+; CHECK-EPILOGUE-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]]
+; CHECK-EPILOGUE-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]]
+; CHECK-EPILOGUE-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]]
+; CHECK-EPILOGUE-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
+; CHECK-EPILOGUE-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
+; CHECK-EPILOGUE-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-EPILOGUE-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
+; CHECK-EPILOGUE-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
+; CHECK-EPILOGUE-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP37]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP38]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP39]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP40]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP41]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP42]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP43]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP44]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP45]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP46]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP47]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP48]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP49]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
+; CHECK-EPILOGUE-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
+; CHECK-EPILOGUE-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3
+; CHECK-EPILOGUE-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4
+; CHECK-EPILOGUE-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5
+; CHECK-EPILOGUE-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6
+; CHECK-EPILOGUE-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7
+; CHECK-EPILOGUE-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8
+; CHECK-EPILOGUE-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9
+; CHECK-EPILOGUE-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10
+; CHECK-EPILOGUE-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11
+; CHECK-EPILOGUE-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12
+; CHECK-EPILOGUE-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
+; CHECK-EPILOGUE-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
+; CHECK-EPILOGUE-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
+; CHECK-EPILOGUE-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP56]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP57]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP58]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP59]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP60]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP61]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP62]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP63]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP64]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP65]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
+; CHECK-EPILOGUE-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
+; CHECK-EPILOGUE-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3
+; CHECK-EPILOGUE-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4
+; CHECK-EPILOGUE-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5
+; CHECK-EPILOGUE-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6
+; CHECK-EPILOGUE-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7
+; CHECK-EPILOGUE-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8
+; CHECK-EPILOGUE-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9
+; CHECK-EPILOGUE-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10
+; CHECK-EPILOGUE-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11
+; CHECK-EPILOGUE-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12
+; CHECK-EPILOGUE-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13
+; CHECK-EPILOGUE-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14
+; CHECK-EPILOGUE-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15
+; CHECK-EPILOGUE-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]]
+; CHECK-EPILOGUE-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]]
+; CHECK-EPILOGUE-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]]
+; CHECK-EPILOGUE-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-EPILOGUE-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
+; CHECK-EPILOGUE-NEXT: [[TMP140:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-EPILOGUE-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP140]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-EPILOGUE-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-EPILOGUE-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-EPILOGUE: for.exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP140]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: ret i32 [[ADD_LCSSA]]
+;
entry:
br label %for.body
@@ -753,6 +992,56 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-EPILOGUE-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15
+; CHECK-EPILOGUE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
+; CHECK-EPILOGUE-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-EPILOGUE-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-EPILOGUE-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-EPILOGUE: for.exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: ret i32 [[ADD_LCSSA]]
+;
entry:
br label %for.body
@@ -923,6 +1212,55 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-EPILOGUE-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
+; CHECK-EPILOGUE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
+; CHECK-EPILOGUE-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-EPILOGUE-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-EPILOGUE-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-EPILOGUE: for.exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: ret i32 [[ADD_LCSSA]]
+;
entry:
br label %for.body
@@ -1336,6 +1674,216 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]]
; CHECK-MAXBW-NEXT: ret i32 [[RESULT]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @dotp_unrolled(
+; CHECK-EPILOGUE-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: iter.check:
+; CHECK-EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 4
+; CHECK-EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-EPILOGUE: vector.main.loop.iter.check:
+; CHECK-EPILOGUE-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
+; CHECK-EPILOGUE-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP16]])
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD7]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
+; CHECK-EPILOGUE-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP21]])
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
+; CHECK-EPILOGUE-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP26]])
+; CHECK-EPILOGUE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
+; CHECK-EPILOGUE-NEXT: [[PARTIAL_REDUCE14]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-EPILOGUE-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE14]])
+; CHECK-EPILOGUE-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE11]])
+; CHECK-EPILOGUE-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE8]])
+; CHECK-EPILOGUE-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-EPILOGUE: vec.epilog.iter.check:
+; CHECK-EPILOGUE-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[NUM_IN]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-EPILOGUE-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-EPILOGUE: vec.epilog.ph:
+; CHECK-EPILOGUE-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP33]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX15:%.*]] = phi i32 [ [[TMP34]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i32 [ [[TMP35]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i32 [ [[TMP36]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF18:%.*]] = urem i64 [[NUM_IN]], 4
+; CHECK-EPILOGUE-NEXT: [[N_VEC19:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF18]]
+; CHECK-EPILOGUE-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX15]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX16]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX17]], i32 0
+; CHECK-EPILOGUE-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vec.epilog.vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT33:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI21:%.*]] = phi <4 x i32> [ [[TMP37]], [[VEC_EPILOG_PH]] ], [ [[TMP76:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI22:%.*]] = phi <4 x i32> [ [[TMP38]], [[VEC_EPILOG_PH]] ], [ [[TMP70:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI23:%.*]] = phi <4 x i32> [ [[TMP39]], [[VEC_EPILOG_PH]] ], [ [[TMP64:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI24:%.*]] = phi <4 x i32> [ [[TMP40]], [[VEC_EPILOG_PH]] ], [ [[TMP58:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP41:%.*]] = add i64 [[INDEX20]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP41]]
+; CHECK-EPILOGUE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP41]]
+; CHECK-EPILOGUE-NEXT: [[TMP44:%.*]] = or disjoint i64 [[TMP41]], 1
+; CHECK-EPILOGUE-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP44]]
+; CHECK-EPILOGUE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP44]]
+; CHECK-EPILOGUE-NEXT: [[TMP47:%.*]] = or disjoint i64 [[TMP41]], 2
+; CHECK-EPILOGUE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP47]]
+; CHECK-EPILOGUE-NEXT: [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP47]]
+; CHECK-EPILOGUE-NEXT: [[TMP50:%.*]] = or disjoint i64 [[TMP41]], 3
+; CHECK-EPILOGUE-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP50]]
+; CHECK-EPILOGUE-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP50]]
+; CHECK-EPILOGUE-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP42]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD25:%.*]] = load <4 x i8>, ptr [[TMP53]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP54:%.*]] = sext <4 x i8> [[WIDE_LOAD25]] to <4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP43]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD26:%.*]] = load <4 x i8>, ptr [[TMP55]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP56:%.*]] = sext <4 x i8> [[WIDE_LOAD26]] to <4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP57:%.*]] = mul nsw <4 x i32> [[TMP56]], [[TMP54]]
+; CHECK-EPILOGUE-NEXT: [[TMP58]] = add <4 x i32> [[TMP57]], [[VEC_PHI24]]
+; CHECK-EPILOGUE-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP45]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD27:%.*]] = load <4 x i8>, ptr [[TMP59]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP60:%.*]] = sext <4 x i8> [[WIDE_LOAD27]] to <4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP61:%.*]] = getelementptr inbounds i8, ptr [[TMP46]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD28:%.*]] = load <4 x i8>, ptr [[TMP61]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP62:%.*]] = sext <4 x i8> [[WIDE_LOAD28]] to <4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP63:%.*]] = mul nsw <4 x i32> [[TMP60]], [[TMP62]]
+; CHECK-EPILOGUE-NEXT: [[TMP64]] = add <4 x i32> [[TMP63]], [[VEC_PHI23]]
+; CHECK-EPILOGUE-NEXT: [[TMP65:%.*]] = getelementptr inbounds i8, ptr [[TMP48]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD29:%.*]] = load <4 x i8>, ptr [[TMP65]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP66:%.*]] = sext <4 x i8> [[WIDE_LOAD29]] to <4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr [[TMP49]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD30:%.*]] = load <4 x i8>, ptr [[TMP67]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP68:%.*]] = sext <4 x i8> [[WIDE_LOAD30]] to <4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP69:%.*]] = mul nsw <4 x i32> [[TMP66]], [[TMP68]]
+; CHECK-EPILOGUE-NEXT: [[TMP70]] = add <4 x i32> [[TMP69]], [[VEC_PHI22]]
+; CHECK-EPILOGUE-NEXT: [[TMP71:%.*]] = getelementptr inbounds i8, ptr [[TMP51]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD31:%.*]] = load <4 x i8>, ptr [[TMP71]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP72:%.*]] = sext <4 x i8> [[WIDE_LOAD31]] to <4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[TMP52]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD32:%.*]] = load <4 x i8>, ptr [[TMP73]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP74:%.*]] = sext <4 x i8> [[WIDE_LOAD32]] to <4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP75:%.*]] = mul nsw <4 x i32> [[TMP72]], [[TMP74]]
+; CHECK-EPILOGUE-NEXT: [[TMP76]] = add <4 x i32> [[TMP75]], [[VEC_PHI21]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX20]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP77:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC19]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP77]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-EPILOGUE: vec.epilog.middle.block:
+; CHECK-EPILOGUE-NEXT: [[TMP78:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP76]])
+; CHECK-EPILOGUE-NEXT: [[TMP79:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP70]])
+; CHECK-EPILOGUE-NEXT: [[TMP80:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP64]])
+; CHECK-EPILOGUE-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP58]])
+; CHECK-EPILOGUE-NEXT: [[CMP_N34:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC19]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_N34]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-EPILOGUE: vec.epilog.scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX35:%.*]] = phi i32 [ [[TMP78]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP33]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX36:%.*]] = phi i32 [ [[TMP79]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP34]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX37:%.*]] = phi i32 [ [[TMP80]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP35]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX38:%.*]] = phi i32 [ [[TMP81]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP36]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX35]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX36]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX37]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX38]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]]
+; CHECK-EPILOGUE-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]]
+; CHECK-EPILOGUE-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2
+; CHECK-EPILOGUE-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]]
+; CHECK-EPILOGUE-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]]
+; CHECK-EPILOGUE-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3
+; CHECK-EPILOGUE-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]]
+; CHECK-EPILOGUE-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32
+; CHECK-EPILOGUE-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]]
+; CHECK-EPILOGUE-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32
+; CHECK-EPILOGUE-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]]
+; CHECK-EPILOGUE-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32
+; CHECK-EPILOGUE-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]]
+; CHECK-EPILOGUE-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32
+; CHECK-EPILOGUE-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]]
+; CHECK-EPILOGUE-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]]
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-EPILOGUE: exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ [[TMP81]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ [[TMP80]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ], [ [[TMP79]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ [[TMP78]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]]
+; CHECK-EPILOGUE-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]]
+; CHECK-EPILOGUE-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]]
+; CHECK-EPILOGUE-NEXT: ret i32 [[RESULT]]
+;
entry:
br label %for.body
@@ -2457,6 +3005,361 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP182]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @dotp_predicated(
+; CHECK-EPILOGUE-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-EPILOGUE-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; CHECK-EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-EPILOGUE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-EPILOGUE: pred.load.if:
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; CHECK-EPILOGUE: pred.load.continue:
+; CHECK-EPILOGUE-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
+; CHECK-EPILOGUE: pred.load.if1:
+; CHECK-EPILOGUE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-EPILOGUE-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE2]]
+; CHECK-EPILOGUE: pred.load.continue2:
+; CHECK-EPILOGUE-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; CHECK-EPILOGUE: pred.load.if3:
+; CHECK-EPILOGUE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-EPILOGUE-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE4]]
+; CHECK-EPILOGUE: pred.load.continue4:
+; CHECK-EPILOGUE-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
+; CHECK-EPILOGUE: pred.load.if5:
+; CHECK-EPILOGUE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE6]]
+; CHECK-EPILOGUE: pred.load.continue6:
+; CHECK-EPILOGUE-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
+; CHECK-EPILOGUE: pred.load.if7:
+; CHECK-EPILOGUE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-EPILOGUE-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE8]]
+; CHECK-EPILOGUE: pred.load.continue8:
+; CHECK-EPILOGUE-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
+; CHECK-EPILOGUE: pred.load.if9:
+; CHECK-EPILOGUE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
+; CHECK-EPILOGUE-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE10]]
+; CHECK-EPILOGUE: pred.load.continue10:
+; CHECK-EPILOGUE-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
+; CHECK-EPILOGUE: pred.load.if11:
+; CHECK-EPILOGUE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE12]]
+; CHECK-EPILOGUE: pred.load.continue12:
+; CHECK-EPILOGUE-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
+; CHECK-EPILOGUE: pred.load.if13:
+; CHECK-EPILOGUE-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
+; CHECK-EPILOGUE-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE14]]
+; CHECK-EPILOGUE: pred.load.continue14:
+; CHECK-EPILOGUE-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
+; CHECK-EPILOGUE: pred.load.if15:
+; CHECK-EPILOGUE-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
+; CHECK-EPILOGUE-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE16]]
+; CHECK-EPILOGUE: pred.load.continue16:
+; CHECK-EPILOGUE-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
+; CHECK-EPILOGUE: pred.load.if17:
+; CHECK-EPILOGUE-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-EPILOGUE-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE18]]
+; CHECK-EPILOGUE: pred.load.continue18:
+; CHECK-EPILOGUE-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
+; CHECK-EPILOGUE: pred.load.if19:
+; CHECK-EPILOGUE-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-EPILOGUE-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE20]]
+; CHECK-EPILOGUE: pred.load.continue20:
+; CHECK-EPILOGUE-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
+; CHECK-EPILOGUE: pred.load.if21:
+; CHECK-EPILOGUE-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-EPILOGUE-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE22]]
+; CHECK-EPILOGUE: pred.load.continue22:
+; CHECK-EPILOGUE-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
+; CHECK-EPILOGUE: pred.load.if23:
+; CHECK-EPILOGUE-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-EPILOGUE-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE24]]
+; CHECK-EPILOGUE: pred.load.continue24:
+; CHECK-EPILOGUE-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
+; CHECK-EPILOGUE: pred.load.if25:
+; CHECK-EPILOGUE-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
+; CHECK-EPILOGUE-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE26]]
+; CHECK-EPILOGUE: pred.load.continue26:
+; CHECK-EPILOGUE-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
+; CHECK-EPILOGUE: pred.load.if27:
+; CHECK-EPILOGUE-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
+; CHECK-EPILOGUE-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE28]]
+; CHECK-EPILOGUE: pred.load.continue28:
+; CHECK-EPILOGUE-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-EPILOGUE: pred.load.if29:
+; CHECK-EPILOGUE-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
+; CHECK-EPILOGUE-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE30]]
+; CHECK-EPILOGUE: pred.load.continue30:
+; CHECK-EPILOGUE-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
+; CHECK-EPILOGUE: pred.load.if31:
+; CHECK-EPILOGUE-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE32]]
+; CHECK-EPILOGUE: pred.load.continue32:
+; CHECK-EPILOGUE-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
+; CHECK-EPILOGUE: pred.load.if33:
+; CHECK-EPILOGUE-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-EPILOGUE-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE34]]
+; CHECK-EPILOGUE: pred.load.continue34:
+; CHECK-EPILOGUE-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
+; CHECK-EPILOGUE: pred.load.if35:
+; CHECK-EPILOGUE-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-EPILOGUE-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE36]]
+; CHECK-EPILOGUE: pred.load.continue36:
+; CHECK-EPILOGUE-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
+; CHECK-EPILOGUE: pred.load.if37:
+; CHECK-EPILOGUE-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE38]]
+; CHECK-EPILOGUE: pred.load.continue38:
+; CHECK-EPILOGUE-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
+; CHECK-EPILOGUE: pred.load.if39:
+; CHECK-EPILOGUE-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-EPILOGUE-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE40]]
+; CHECK-EPILOGUE: pred.load.continue40:
+; CHECK-EPILOGUE-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
+; CHECK-EPILOGUE: pred.load.if41:
+; CHECK-EPILOGUE-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-EPILOGUE-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE42]]
+; CHECK-EPILOGUE: pred.load.continue42:
+; CHECK-EPILOGUE-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
+; CHECK-EPILOGUE: pred.load.if43:
+; CHECK-EPILOGUE-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE44]]
+; CHECK-EPILOGUE: pred.load.continue44:
+; CHECK-EPILOGUE-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
+; CHECK-EPILOGUE: pred.load.if45:
+; CHECK-EPILOGUE-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-EPILOGUE-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE46]]
+; CHECK-EPILOGUE: pred.load.continue46:
+; CHECK-EPILOGUE-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
+; CHECK-EPILOGUE: pred.load.if47:
+; CHECK-EPILOGUE-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-EPILOGUE-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE48]]
+; CHECK-EPILOGUE: pred.load.continue48:
+; CHECK-EPILOGUE-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
+; CHECK-EPILOGUE: pred.load.if49:
+; CHECK-EPILOGUE-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-EPILOGUE-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE50]]
+; CHECK-EPILOGUE: pred.load.continue50:
+; CHECK-EPILOGUE-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
+; CHECK-EPILOGUE: pred.load.if51:
+; CHECK-EPILOGUE-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-EPILOGUE-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE52]]
+; CHECK-EPILOGUE: pred.load.continue52:
+; CHECK-EPILOGUE-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
+; CHECK-EPILOGUE: pred.load.if53:
+; CHECK-EPILOGUE-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-EPILOGUE-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE54]]
+; CHECK-EPILOGUE: pred.load.continue54:
+; CHECK-EPILOGUE-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
+; CHECK-EPILOGUE: pred.load.if55:
+; CHECK-EPILOGUE-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-EPILOGUE-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE56]]
+; CHECK-EPILOGUE: pred.load.continue56:
+; CHECK-EPILOGUE-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
+; CHECK-EPILOGUE: pred.load.if57:
+; CHECK-EPILOGUE-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-EPILOGUE-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE58]]
+; CHECK-EPILOGUE: pred.load.continue58:
+; CHECK-EPILOGUE-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
+; CHECK-EPILOGUE: pred.load.if59:
+; CHECK-EPILOGUE-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-EPILOGUE-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE60]]
+; CHECK-EPILOGUE: pred.load.continue60:
+; CHECK-EPILOGUE-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
+; CHECK-EPILOGUE: pred.load.if61:
+; CHECK-EPILOGUE-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-EPILOGUE-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
+; CHECK-EPILOGUE-NEXT: br label [[PRED_LOAD_CONTINUE62]]
+; CHECK-EPILOGUE: pred.load.continue62:
+; CHECK-EPILOGUE-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
+; CHECK-EPILOGUE-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
+; CHECK-EPILOGUE-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
+; CHECK-EPILOGUE-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-EPILOGUE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP182]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-EPILOGUE-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
+; CHECK-EPILOGUE-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-EPILOGUE: exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP182]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: ret i32 [[ADD_LCSSA]]
+;
entry:
br label %for.body
@@ -2643,6 +3546,67 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
; CHECK-MAXBW-NEXT: ret i32 [[RESULT]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-EPILOGUE-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-EPILOGUE-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]]
+; CHECK-EPILOGUE-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]]
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP10]], i32 15
+; CHECK-EPILOGUE-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-EPILOGUE-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-EPILOGUE-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-EPILOGUE: for.exit:
+; CHECK-EPILOGUE-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
+; CHECK-EPILOGUE-NEXT: ret i32 [[RESULT]]
+;
entry:
br label %for.body
@@ -2725,3 +3689,22 @@ for.exit: ; preds = %for.body
; CHECK-MAXBW: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
; CHECK-MAXBW: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]}
;.
+; CHECK-EPILOGUE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-EPILOGUE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-EPILOGUE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-EPILOGUE: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP13]] = distinct !{[[LOOP13]], [[META14:![0-9]+]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[META14]] = !{!"llvm.loop.mustprogress"}
+; CHECK-EPILOGUE: [[LOOP15]] = distinct !{[[LOOP15]], [[META14]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 32833231d98bc5..bb056bf41c1303 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -2,6 +2,7 @@
; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=true -S < %s | FileCheck %s --check-prefixes=CHECK-EPILOGUE
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
@@ -192,6 +193,77 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @dotp(
+; CHECK-EPILOGUE-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-EPILOGUE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP15]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP20]], [[TMP13]]
+; CHECK-EPILOGUE-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP21]], [[VEC_PHI]]
+; CHECK-EPILOGUE-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-EPILOGUE-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
+; CHECK-EPILOGUE-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP26]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-EPILOGUE-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-EPILOGUE-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-EPILOGUE: for.exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: ret i32 [[ADD_LCSSA]]
+;
entry:
br label %for.body
@@ -611,6 +683,186 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @not_dotp_different_types(
+; CHECK-EPILOGUE-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18
+; CHECK-EPILOGUE-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20
+; CHECK-EPILOGUE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22
+; CHECK-EPILOGUE-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23
+; CHECK-EPILOGUE-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24
+; CHECK-EPILOGUE-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25
+; CHECK-EPILOGUE-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26
+; CHECK-EPILOGUE-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27
+; CHECK-EPILOGUE-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28
+; CHECK-EPILOGUE-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29
+; CHECK-EPILOGUE-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
+; CHECK-EPILOGUE-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
+; CHECK-EPILOGUE-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-EPILOGUE-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-EPILOGUE-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-EPILOGUE-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-EPILOGUE-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-EPILOGUE-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-EPILOGUE-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-EPILOGUE-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-EPILOGUE-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-EPILOGUE-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-EPILOGUE-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-EPILOGUE-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-EPILOGUE-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-EPILOGUE-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-EPILOGUE-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
+; CHECK-EPILOGUE-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-EPILOGUE-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
+; CHECK-EPILOGUE-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-EPILOGUE-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
+; CHECK-EPILOGUE-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]]
+; CHECK-EPILOGUE-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]]
+; CHECK-EPILOGUE-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
+; CHECK-EPILOGUE-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]]
+; CHECK-EPILOGUE-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]]
+; CHECK-EPILOGUE-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]]
+; CHECK-EPILOGUE-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
+; CHECK-EPILOGUE-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
+; CHECK-EPILOGUE-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-EPILOGUE-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
+; CHECK-EPILOGUE-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
+; CHECK-EPILOGUE-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP37]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP38]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP39]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP40]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP41]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP42]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP43]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP44]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP45]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP46]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP47]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP48]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP49]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
+; CHECK-EPILOGUE-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
+; CHECK-EPILOGUE-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3
+; CHECK-EPILOGUE-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4
+; CHECK-EPILOGUE-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5
+; CHECK-EPILOGUE-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6
+; CHECK-EPILOGUE-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7
+; CHECK-EPILOGUE-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8
+; CHECK-EPILOGUE-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9
+; CHECK-EPILOGUE-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10
+; CHECK-EPILOGUE-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11
+; CHECK-EPILOGUE-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12
+; CHECK-EPILOGUE-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
+; CHECK-EPILOGUE-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
+; CHECK-EPILOGUE-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
+; CHECK-EPILOGUE-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP56]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP57]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP58]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP59]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP60]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP61]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP62]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP63]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP64]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP65]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
+; CHECK-EPILOGUE-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
+; CHECK-EPILOGUE-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3
+; CHECK-EPILOGUE-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4
+; CHECK-EPILOGUE-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5
+; CHECK-EPILOGUE-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6
+; CHECK-EPILOGUE-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7
+; CHECK-EPILOGUE-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8
+; CHECK-EPILOGUE-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9
+; CHECK-EPILOGUE-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10
+; CHECK-EPILOGUE-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11
+; CHECK-EPILOGUE-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12
+; CHECK-EPILOGUE-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13
+; CHECK-EPILOGUE-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14
+; CHECK-EPILOGUE-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15
+; CHECK-EPILOGUE-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]]
+; CHECK-EPILOGUE-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]]
+; CHECK-EPILOGUE-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]]
+; CHECK-EPILOGUE-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-EPILOGUE-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
+; CHECK-EPILOGUE-NEXT: [[TMP140:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-EPILOGUE-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP140]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-EPILOGUE-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-EPILOGUE-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-EPILOGUE: for.exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP140]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: ret i32 [[ADD_LCSSA]]
+;
entry:
br label %for.body
@@ -852,6 +1104,86 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-EPILOGUE-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-EPILOGUE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-EPILOGUE-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8
+; CHECK-EPILOGUE-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
+; CHECK-EPILOGUE-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
+; CHECK-EPILOGUE-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1)
+; CHECK-EPILOGUE-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-EPILOGUE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-EPILOGUE-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 8
+; CHECK-EPILOGUE-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1
+; CHECK-EPILOGUE-NEXT: [[TMP32:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP31]]
+; CHECK-EPILOGUE-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-EPILOGUE-NEXT: [[TMP34:%.*]] = mul i32 [[TMP33]], 8
+; CHECK-EPILOGUE-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], 1
+; CHECK-EPILOGUE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP35]]
+; CHECK-EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-EPILOGUE-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-EPILOGUE-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-EPILOGUE: for.exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: ret i32 [[ADD_LCSSA]]
+;
entry:
br label %for.body
@@ -1083,6 +1415,78 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-EPILOGUE-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-EPILOGUE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-EPILOGUE-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP12]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP14]]
+; CHECK-EPILOGUE-NEXT: [[TMP21]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-EPILOGUE-NEXT: [[TMP24:%.*]] = mul i32 [[TMP23]], 8
+; CHECK-EPILOGUE-NEXT: [[TMP25:%.*]] = sub i32 [[TMP24]], 1
+; CHECK-EPILOGUE-NEXT: [[TMP26:%.*]] = extractelement <vscale x 8 x i32> [[TMP21]], i32 [[TMP25]]
+; CHECK-EPILOGUE-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-EPILOGUE-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], 8
+; CHECK-EPILOGUE-NEXT: [[TMP29:%.*]] = sub i32 [[TMP28]], 1
+; CHECK-EPILOGUE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP21]], i32 [[TMP29]]
+; CHECK-EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-EPILOGUE-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-EPILOGUE-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-EPILOGUE: for.exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: ret i32 [[ADD_LCSSA]]
+;
entry:
br label %for.body
@@ -1570,6 +1974,198 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]]
; CHECK-MAXBW-NEXT: ret i32 [[RESULT]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @dotp_unrolled(
+; CHECK-EPILOGUE-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP1]]
+; CHECK-EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP32:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP6]], 1
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = or disjoint i64 [[TMP6]], 2
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = or disjoint i64 [[TMP6]], 3
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP20]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP23:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP26]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP27]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP28:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP29:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP30:%.*]] = mul nsw <vscale x 4 x i32> [[TMP28]], [[TMP22]]
+; CHECK-EPILOGUE-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 4 x i32> [[TMP29]], [[TMP23]]
+; CHECK-EPILOGUE-NEXT: [[TMP32]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI6]]
+; CHECK-EPILOGUE-NEXT: [[TMP33]] = add <vscale x 4 x i32> [[TMP31]], [[VEC_PHI7]]
+; CHECK-EPILOGUE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP36:%.*]] = mul i64 [[TMP35]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP36]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD11:%.*]] = load <vscale x 4 x i8>, ptr [[TMP34]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 4 x i8>, ptr [[TMP37]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP38:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD11]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD12]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP42]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD13:%.*]] = load <vscale x 4 x i8>, ptr [[TMP40]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 4 x i8>, ptr [[TMP43]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP44:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD13]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP45:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD14]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP46:%.*]] = mul nsw <vscale x 4 x i32> [[TMP38]], [[TMP44]]
+; CHECK-EPILOGUE-NEXT: [[TMP47:%.*]] = mul nsw <vscale x 4 x i32> [[TMP39]], [[TMP45]]
+; CHECK-EPILOGUE-NEXT: [[TMP48]] = add <vscale x 4 x i32> [[TMP46]], [[VEC_PHI4]]
+; CHECK-EPILOGUE-NEXT: [[TMP49]] = add <vscale x 4 x i32> [[TMP47]], [[VEC_PHI5]]
+; CHECK-EPILOGUE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i64 [[TMP52]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD15:%.*]] = load <vscale x 4 x i8>, ptr [[TMP50]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x i8>, ptr [[TMP53]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP54:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD15]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP55:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD16]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 [[TMP58]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD17:%.*]] = load <vscale x 4 x i8>, ptr [[TMP56]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 4 x i8>, ptr [[TMP59]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP60:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD17]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP61:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD18]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP62:%.*]] = mul nsw <vscale x 4 x i32> [[TMP54]], [[TMP60]]
+; CHECK-EPILOGUE-NEXT: [[TMP63:%.*]] = mul nsw <vscale x 4 x i32> [[TMP55]], [[TMP61]]
+; CHECK-EPILOGUE-NEXT: [[TMP64]] = add <vscale x 4 x i32> [[TMP62]], [[VEC_PHI2]]
+; CHECK-EPILOGUE-NEXT: [[TMP65]] = add <vscale x 4 x i32> [[TMP63]], [[VEC_PHI3]]
+; CHECK-EPILOGUE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP68:%.*]] = mul i64 [[TMP67]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i64 [[TMP68]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD19:%.*]] = load <vscale x 4 x i8>, ptr [[TMP66]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 4 x i8>, ptr [[TMP69]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP70:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD19]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP71:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD20]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP73:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP74:%.*]] = mul i64 [[TMP73]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 [[TMP74]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD21:%.*]] = load <vscale x 4 x i8>, ptr [[TMP72]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD22:%.*]] = load <vscale x 4 x i8>, ptr [[TMP75]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP76:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD21]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP77:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD22]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP78:%.*]] = mul nsw <vscale x 4 x i32> [[TMP70]], [[TMP76]]
+; CHECK-EPILOGUE-NEXT: [[TMP79:%.*]] = mul nsw <vscale x 4 x i32> [[TMP71]], [[TMP77]]
+; CHECK-EPILOGUE-NEXT: [[TMP80]] = add <vscale x 4 x i32> [[TMP78]], [[VEC_PHI]]
+; CHECK-EPILOGUE-NEXT: [[TMP81]] = add <vscale x 4 x i32> [[TMP79]], [[VEC_PHI1]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-EPILOGUE-NEXT: [[TMP82:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP82]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP81]], [[TMP80]]
+; CHECK-EPILOGUE-NEXT: [[TMP83:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX23:%.*]] = add <vscale x 4 x i32> [[TMP65]], [[TMP64]]
+; CHECK-EPILOGUE-NEXT: [[TMP84:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX23]])
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX24:%.*]] = add <vscale x 4 x i32> [[TMP49]], [[TMP48]]
+; CHECK-EPILOGUE-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX24]])
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX25:%.*]] = add <vscale x 4 x i32> [[TMP33]], [[TMP32]]
+; CHECK-EPILOGUE-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX25]])
+; CHECK-EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP83]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX26:%.*]] = phi i32 [ [[TMP84]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX27:%.*]] = phi i32 [ [[TMP85]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX28:%.*]] = phi i32 [ [[TMP86]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX26]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX27]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX28]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]]
+; CHECK-EPILOGUE-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]]
+; CHECK-EPILOGUE-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2
+; CHECK-EPILOGUE-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]]
+; CHECK-EPILOGUE-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]]
+; CHECK-EPILOGUE-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3
+; CHECK-EPILOGUE-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]]
+; CHECK-EPILOGUE-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32
+; CHECK-EPILOGUE-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]]
+; CHECK-EPILOGUE-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32
+; CHECK-EPILOGUE-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]]
+; CHECK-EPILOGUE-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32
+; CHECK-EPILOGUE-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]]
+; CHECK-EPILOGUE-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32
+; CHECK-EPILOGUE-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]]
+; CHECK-EPILOGUE-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]]
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-EPILOGUE: exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP84]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP83]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]]
+; CHECK-EPILOGUE-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]]
+; CHECK-EPILOGUE-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]]
+; CHECK-EPILOGUE-NEXT: ret i32 [[RESULT]]
+;
entry:
br label %for.body
@@ -1824,6 +2420,72 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @dotp_predicated(
+; CHECK-EPILOGUE-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-EPILOGUE-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-EPILOGUE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
+; CHECK-EPILOGUE-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-EPILOGUE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-EPILOGUE-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-EPILOGUE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-EPILOGUE-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
+; CHECK-EPILOGUE-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-EPILOGUE: exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: ret i32 [[ADD_LCSSA]]
+;
entry:
br label %for.body
@@ -2050,6 +2712,83 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
; CHECK-MAXBW-NEXT: ret i32 [[RESULT]]
;
+; CHECK-EPILOGUE-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-EPILOGUE-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-EPILOGUE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP15]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-EPILOGUE-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP20]], [[TMP13]]
+; CHECK-EPILOGUE-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP21]], [[VEC_PHI]]
+; CHECK-EPILOGUE-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-EPILOGUE-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
+; CHECK-EPILOGUE-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-EPILOGUE-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-EPILOGUE-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], 4
+; CHECK-EPILOGUE-NEXT: [[TMP29:%.*]] = sub i32 [[TMP28]], 1
+; CHECK-EPILOGUE-NEXT: [[TMP30:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP29]]
+; CHECK-EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP26]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-EPILOGUE-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-EPILOGUE-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-EPILOGUE: for.exit:
+; CHECK-EPILOGUE-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
+; CHECK-EPILOGUE-NEXT: ret i32 [[RESULT]]
+;
entry:
br label %for.body
@@ -2265,6 +3004,79 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
; CHECK-MAXBW-NEXT: ret i64 [[ADD_LCSSA]]
;
+; CHECK-EPILOGUE-LABEL: define i64 @dotp_cost_disagreement(
+; CHECK-EPILOGUE-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
+; CHECK-EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP10]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP8]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2
+; CHECK-EPILOGUE-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP18]]
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
+; CHECK-EPILOGUE-NEXT: [[TMP21:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP20]], [[TMP12]]
+; CHECK-EPILOGUE-NEXT: [[TMP23:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP21]], [[TMP13]]
+; CHECK-EPILOGUE-NEXT: [[TMP24]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP22]]
+; CHECK-EPILOGUE-NEXT: [[TMP25]] = add <vscale x 2 x i64> [[VEC_PHI1]], [[TMP23]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-EPILOGUE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
+; CHECK-EPILOGUE-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[I_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[I_IV]]
+; CHECK-EPILOGUE-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-EPILOGUE-NEXT: [[CONV:%.*]] = zext i8 [[TMP28]] to i64
+; CHECK-EPILOGUE-NEXT: [[I_IV_NEXT]] = add nuw nsw i64 [[I_IV]], 1
+; CHECK-EPILOGUE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[I_IV_NEXT]]
+; CHECK-EPILOGUE-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-EPILOGUE-NEXT: [[CONV3:%.*]] = zext i8 [[TMP29]] to i64
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]]
+; CHECK-EPILOGUE-NEXT: [[ADD]] = add i64 [[SUM]], [[MUL]]
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-EPILOGUE: exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: ret i64 [[ADD_LCSSA]]
+;
entry:
br label %for.body
@@ -2452,6 +3264,94 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 {
; CHECK-MAXBW-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4
; CHECK-MAXBW-NEXT: ret void
;
+; CHECK-EPILOGUE-LABEL: define void @not_dotp_not_phi2(
+; CHECK-EPILOGUE-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP]], label [[FOR_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-EPILOGUE: for.preheader:
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i8, ptr null, align 1
+; CHECK-EPILOGUE-NEXT: [[LOAD_A1:%.*]] = load i8, ptr inttoptr (i64 1 to ptr), align 1
+; CHECK-EPILOGUE-NEXT: [[A_EXT:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-EPILOGUE-NEXT: [[A_EXT1:%.*]] = sext i8 [[LOAD_A1]] to i32
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; CHECK-EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
+; CHECK-EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]]
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 16
+; CHECK-EPILOGUE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP4]]
+; CHECK-EPILOGUE-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP5]]
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 1
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 1
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 2
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 2
+; CHECK-EPILOGUE-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = sext i8 [[TMP10]] to i32
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = sext i8 [[TMP11]] to i32
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = mul nsw i32 [[A_EXT]], [[TMP12]]
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = mul nsw i32 [[A_EXT]], [[TMP13]]
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[VEC_PHI]]
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = add i32 [[TMP15]], [[VEC_PHI1]]
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = sext i8 [[TMP18]] to i32
+; CHECK-EPILOGUE-NEXT: [[TMP21:%.*]] = sext i8 [[TMP19]] to i32
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = mul nsw i32 [[A_EXT1]], [[TMP20]]
+; CHECK-EPILOGUE-NEXT: [[TMP23:%.*]] = mul nsw i32 [[A_EXT1]], [[TMP21]]
+; CHECK-EPILOGUE-NEXT: [[TMP24]] = add i32 [[TMP22]], [[TMP16]]
+; CHECK-EPILOGUE-NEXT: [[TMP25]] = add i32 [[TMP23]], [[TMP17]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-EPILOGUE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP25]], [[TMP24]]
+; CHECK-EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-EPILOGUE: scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ [[MATRIX]], [[FOR_PREHEADER]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-EPILOGUE-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ADD_1:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[PTR]], i64 1
+; CHECK-EPILOGUE-NEXT: [[GEP_B1:%.*]] = getelementptr i8, ptr [[PTR]], i64 2
+; CHECK-EPILOGUE-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-EPILOGUE-NEXT: [[B_EXT:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-EPILOGUE-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
+; CHECK-EPILOGUE-NEXT: [[B_EXT1:%.*]] = sext i8 [[LOAD_B1]] to i32
+; CHECK-EPILOGUE-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[A_EXT1]], [[B_EXT1]]
+; CHECK-EPILOGUE-NEXT: [[ADD_1]] = add i32 [[MUL_1]], [[ADD]]
+; CHECK-EPILOGUE-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; CHECK-EPILOGUE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-EPILOGUE: for.exit:
+; CHECK-EPILOGUE-NEXT: [[ADD_1_LCSSA:%.*]] = phi i32 [ [[ADD_1]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: [[ADD_FLOAT:%.*]] = sitofp i32 [[ADD_1_LCSSA]] to float
+; CHECK-EPILOGUE-NEXT: br label [[EXIT]]
+; CHECK-EPILOGUE: exit:
+; CHECK-EPILOGUE-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_FLOAT]], [[FOR_EXIT]] ]
+; CHECK-EPILOGUE-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4
+; CHECK-EPILOGUE-NEXT: ret void
+;
entry:
%cmp = icmp sgt i32 %n, 0
br i1 %cmp, label %for.preheader, label %exit
@@ -2669,6 +3569,108 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
; CHECK-MAXBW-NEXT: ret i64 [[RESULT]]
;
+; CHECK-EPILOGUE-LABEL: define i64 @not_dotp_ext_outside_plan(
+; CHECK-EPILOGUE-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[ITER_CHECK:%.*]]
+; CHECK-EPILOGUE: iter.check:
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-EPILOGUE: vector.main.loop.iter.check:
+; CHECK-EPILOGUE-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0
+; CHECK-EPILOGUE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP2]]
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP3]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP3]], i32 8
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64>
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = mul nuw nsw <8 x i64> [[TMP6]], [[BROADCAST_SPLAT]]
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = mul nuw nsw <8 x i64> [[TMP7]], [[BROADCAST_SPLAT]]
+; CHECK-EPILOGUE-NEXT: [[TMP10]] = add <8 x i64> [[TMP8]], [[VEC_PHI]]
+; CHECK-EPILOGUE-NEXT: [[TMP11]] = add <8 x i64> [[TMP9]], [[VEC_PHI2]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP11]], [[TMP10]]
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
+; CHECK-EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-EPILOGUE: vec.epilog.iter.check:
+; CHECK-EPILOGUE-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; CHECK-EPILOGUE-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP15]]
+; CHECK-EPILOGUE-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-EPILOGUE: vec.epilog.ph:
+; CHECK-EPILOGUE-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[N]], [[TMP17]]
+; CHECK-EPILOGUE-NEXT: [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]]
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXT_B]], i64 0
+; CHECK-EPILOGUE-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT9]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-EPILOGUE-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vec.epilog.vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 2 x i64> [ [[TMP20]], [[VEC_EPILOG_PH]] ], [ [[TMP26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX6]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP21]]
+; CHECK-EPILOGUE-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP22]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 2 x i16>, ptr [[TMP23]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP24:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD8]] to <vscale x 2 x i64>
+; CHECK-EPILOGUE-NEXT: [[TMP25:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP24]], [[BROADCAST_SPLAT10]]
+; CHECK-EPILOGUE-NEXT: [[TMP26]] = add <vscale x 2 x i64> [[TMP25]], [[VEC_PHI7]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX6]], [[TMP19]]
+; CHECK-EPILOGUE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-EPILOGUE: vec.epilog.middle.block:
+; CHECK-EPILOGUE-NEXT: [[TMP28:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP26]])
+; CHECK-EPILOGUE-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC5]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_N12]], label [[EXIT_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-EPILOGUE: vec.epilog.scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i64 [ [[TMP28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX13]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_A]], [[EXT_B]]
+; CHECK-EPILOGUE-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-EPILOGUE: exit.loopexit:
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[TMP28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: br label [[EXIT]]
+; CHECK-EPILOGUE: exit:
+; CHECK-EPILOGUE-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-EPILOGUE-NEXT: ret i64 [[RESULT]]
+;
entry:
%cmp = icmp eq i64 %n, 0
br i1 %cmp, label %exit, label %for.ph
@@ -2870,6 +3872,108 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
; CHECK-MAXBW-NEXT: ret i64 [[RESULT]]
;
+; CHECK-EPILOGUE-LABEL: define i64 @not_dotp_ext_outside_plan2(
+; CHECK-EPILOGUE-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-EPILOGUE-NEXT: entry:
+; CHECK-EPILOGUE-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[ITER_CHECK:%.*]]
+; CHECK-EPILOGUE: iter.check:
+; CHECK-EPILOGUE-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64
+; CHECK-EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-EPILOGUE: vector.main.loop.iter.check:
+; CHECK-EPILOGUE-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-EPILOGUE: vector.ph:
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0
+; CHECK-EPILOGUE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; CHECK-EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP2]]
+; CHECK-EPILOGUE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP3]], i32 0
+; CHECK-EPILOGUE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP3]], i32 8
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP6:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-EPILOGUE-NEXT: [[TMP7:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64>
+; CHECK-EPILOGUE-NEXT: [[TMP8:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; CHECK-EPILOGUE-NEXT: [[TMP9:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP7]]
+; CHECK-EPILOGUE-NEXT: [[TMP10]] = add <8 x i64> [[TMP8]], [[VEC_PHI]]
+; CHECK-EPILOGUE-NEXT: [[TMP11]] = add <8 x i64> [[TMP9]], [[VEC_PHI2]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-EPILOGUE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-EPILOGUE: middle.block:
+; CHECK-EPILOGUE-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP11]], [[TMP10]]
+; CHECK-EPILOGUE-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
+; CHECK-EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-EPILOGUE: vec.epilog.iter.check:
+; CHECK-EPILOGUE-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
+; CHECK-EPILOGUE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; CHECK-EPILOGUE-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP15]]
+; CHECK-EPILOGUE-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-EPILOGUE: vec.epilog.ph:
+; CHECK-EPILOGUE-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; CHECK-EPILOGUE-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[N]], [[TMP17]]
+; CHECK-EPILOGUE-NEXT: [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]]
+; CHECK-EPILOGUE-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-EPILOGUE-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
+; CHECK-EPILOGUE-NEXT: [[TMP20:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXT_B]], i64 0
+; CHECK-EPILOGUE-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT9]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-EPILOGUE-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-EPILOGUE: vec.epilog.vector.body:
+; CHECK-EPILOGUE-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 2 x i64> [ [[TMP20]], [[VEC_EPILOG_PH]] ], [ [[TMP26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX6]], 0
+; CHECK-EPILOGUE-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP21]]
+; CHECK-EPILOGUE-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP22]], i32 0
+; CHECK-EPILOGUE-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 2 x i16>, ptr [[TMP23]], align 2
+; CHECK-EPILOGUE-NEXT: [[TMP24:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD8]] to <vscale x 2 x i64>
+; CHECK-EPILOGUE-NEXT: [[TMP25:%.*]] = mul nuw nsw <vscale x 2 x i64> [[BROADCAST_SPLAT10]], [[TMP24]]
+; CHECK-EPILOGUE-NEXT: [[TMP26]] = add <vscale x 2 x i64> [[TMP25]], [[VEC_PHI7]]
+; CHECK-EPILOGUE-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX6]], [[TMP19]]
+; CHECK-EPILOGUE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]]
+; CHECK-EPILOGUE-NEXT: br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-EPILOGUE: vec.epilog.middle.block:
+; CHECK-EPILOGUE-NEXT: [[TMP28:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP26]])
+; CHECK-EPILOGUE-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC5]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_N12]], label [[EXIT_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-EPILOGUE: vec.epilog.scalar.ph:
+; CHECK-EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i64 [ [[TMP28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-EPILOGUE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-EPILOGUE: for.body:
+; CHECK-EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX13]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-EPILOGUE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
+; CHECK-EPILOGUE-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
+; CHECK-EPILOGUE-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
+; CHECK-EPILOGUE-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]]
+; CHECK-EPILOGUE-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
+; CHECK-EPILOGUE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-EPILOGUE-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-EPILOGUE-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-EPILOGUE: exit.loopexit:
+; CHECK-EPILOGUE-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[TMP28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-EPILOGUE-NEXT: br label [[EXIT]]
+; CHECK-EPILOGUE: exit:
+; CHECK-EPILOGUE-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-EPILOGUE-NEXT: ret i64 [[RESULT]]
+;
entry:
%cmp = icmp eq i64 %n, 0
br i1 %cmp, label %exit, label %for.ph
@@ -2975,3 +4079,31 @@ attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
; CHECK-MAXBW: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
; CHECK-MAXBW: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]}
;.
+; CHECK-EPILOGUE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-EPILOGUE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-EPILOGUE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-EPILOGUE: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[META13]] = !{!"llvm.loop.mustprogress"}
+; CHECK-EPILOGUE: [[LOOP14]] = distinct !{[[LOOP14]], [[META13]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
+; CHECK-EPILOGUE: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]], [[META2]]}
+; CHECK-EPILOGUE: [[LOOP26]] = distinct !{[[LOOP26]], [[META2]], [[META1]]}
+;.
>From ca11aed2d3bb73fdf86b9a1682d2ae595fa5d0a6 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Fri, 31 Jan 2025 14:20:06 +0000
Subject: [PATCH 5/5] Address comments on PR
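Replaces the null check on the block-in mask with an explicit blockNeedsPredicationForAnyReason() check before creating the select, and removes the outdated sentence about a mask operand from the VPPartialReductionRecipe comment.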
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
llvm/lib/Transforms/Vectorize/VPlan.h | 3 +--
2 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 333090821b915a..075350eabe8001 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8897,8 +8897,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
isa<VPPartialReductionRecipe>(BinOpRecipe))
std::swap(BinOp, Accumulator);
- VPValue *Mask = getBlockInMask(Reduction->getParent());
- if (Mask) {
+ if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) {
+ VPValue *Mask = getBlockInMask(Reduction->getParent());
VPValue *Zero =
Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
BinOp = Builder.createSelect(Mask, BinOp, Zero, Reduction->getDebugLoc());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1f7100093d7015..a1ff684b2b8017 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2448,8 +2448,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// A recipe for forming partial reductions. In the loop, an accumulator and
/// vector operand are added together and passed to the next iteration as the
/// next accumulator. After the loop body, the accumulator is reduced to a
-/// scalar value. If the mask operand is not nullptr then it is applied to the
-/// vector operand on each iteration.
+/// scalar value.
class VPPartialReductionRecipe : public VPSingleDefRecipe {
unsigned Opcode;
More information about the llvm-commits
mailing list