[llvm] [AArch64] Refactor and refine cost-model for partial reductions (PR #158641)
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 29 03:50:52 PDT 2025
https://github.com/sdesmalen-arm updated https://github.com/llvm/llvm-project/pull/158641
>From 2fc143267867f369c3a4830c9700c54b2d4de5ec Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 12 Sep 2025 10:22:46 +0100
Subject: [PATCH 1/2] [AArch64] Refactor and refine cost-model for partial
reductions
This cost-model takes into account any type-legalisation
that would happen on vectors such as splitting and promotion.
This results in wider VFs being chosen for loops that can
use partial reductions.
The cost-model now also assumes that when SVE is available, the SVE dot
instructions for i16 -> i64 dot products can be used for fixed-length
vectors. In practice this means that loops with non-scalable VFs are
vectorized using partial reductions where they wouldn't before, e.g.
int64_t foo2(int8_t *src1, int8_t *src2, int N) {
int64_t sum = 0;
for (int i=0; i<N; ++i)
sum += (int64_t)src1[i] * (int64_t)src2[i];
return sum;
}
These changes also fix an issue where previously a partial reduction
would be used for mixed sign/zero-extends (USDOT), even when
+i8mm was not available.
---
.../AArch64/AArch64TargetTransformInfo.cpp | 111 ++--
.../AArch64/fully-unrolled-cost.ll | 4 +-
.../AArch64/partial-reduce-chained.ll | 188 +++---
.../partial-reduce-dot-product-epilogue.ll | 20 +-
.../partial-reduce-dot-product-mixed.ll | 100 ++--
.../AArch64/partial-reduce-dot-product.ll | 562 +++++++-----------
.../AArch64/partial-reduce-sub.ll | 88 ++-
.../LoopVectorize/AArch64/partial-reduce.ll | 94 +--
.../LoopVectorize/AArch64/reg-usage.ll | 2 +-
.../LoopVectorize/AArch64/vplan-printing.ll | 4 +-
10 files changed, 519 insertions(+), 654 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8c4b4f6e4d6de..a2ee509fe4cb7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5632,75 +5632,88 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
TTI::TargetCostKind CostKind) const {
InstructionCost Invalid = InstructionCost::getInvalid();
- InstructionCost Cost(TTI::TCC_Basic);
if (CostKind != TTI::TCK_RecipThroughput)
return Invalid;
- // Sub opcodes currently only occur in chained cases.
- // Independent partial reduction subtractions are still costed as an add
+ if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
+ return Invalid;
+
+ if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
+ (!ST->isNeonAvailable() || !ST->hasDotProd()))
+ return Invalid;
+
if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
OpAExtend == TTI::PR_None)
return Invalid;
// We only support multiply binary operations for now, and for muls we
// require the types being extended to be the same.
- // NOTE: For muls AArch64 supports lowering mixed extensions to a usdot but
- // only if the i8mm or sve/streaming features are available.
- if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB ||
- OpBExtend == TTI::PR_None ||
- (OpAExtend != OpBExtend && !ST->hasMatMulInt8() &&
- !ST->isSVEorStreamingSVEAvailable())))
+ if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
return Invalid;
assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
"Unexpected values for OpBExtend or InputTypeB");
- EVT InputEVT = EVT::getEVT(InputTypeA);
- EVT AccumEVT = EVT::getEVT(AccumType);
+ bool IsUSDot = OpBExtend && OpAExtend != OpBExtend;
+ if (IsUSDot && !ST->hasMatMulInt8())
+ return Invalid;
- unsigned VFMinValue = VF.getKnownMinValue();
+ unsigned Ratio =
+ AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
+ if (VF.getKnownMinValue() < Ratio)
+ return Invalid;
- if (VF.isScalable()) {
- if (!ST->isSVEorStreamingSVEAvailable())
- return Invalid;
+ VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
+ VectorType *AccumVectorType =
+ VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
- // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
- // since we can't lower that type.
- unsigned Scale =
- AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
- if (VFMinValue == Scale)
- return Invalid;
- }
- if (VF.isFixed() &&
- (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64))
+ // We don't yet support widening for <vscale x 1 x ..> accumulators.
+ if (AccumVectorType->getElementCount() == ElementCount::getScalable(1))
return Invalid;
- if (InputEVT == MVT::i8) {
- switch (VFMinValue) {
- default:
- return Invalid;
- case 8:
- if (AccumEVT == MVT::i32)
- Cost *= 2;
- else if (AccumEVT != MVT::i64)
- return Invalid;
- break;
- case 16:
- if (AccumEVT == MVT::i64)
- Cost *= 2;
- else if (AccumEVT != MVT::i32)
- return Invalid;
- break;
- }
- } else if (InputEVT == MVT::i16) {
- // FIXME: Allow i32 accumulator but increase cost, as we would extend
- // it to i64.
- if (VFMinValue != 8 || AccumEVT != MVT::i64)
- return Invalid;
- } else
- return Invalid;
+ // Check what kind of type-legalisation happens.
+ std::pair<InstructionCost, MVT> AccumLT =
+ getTypeLegalizationCost(AccumVectorType);
+ std::pair<InstructionCost, MVT> InputLT =
+ getTypeLegalizationCost(InputVectorType);
- return Cost;
+ InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
+
+ // Prefer using full types by costing half-full input types as more expensive.
+ if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
+ TypeSize::getScalable(128)))
+ // FIXME: This can be removed after the cost of the extends are folded into
+ // the dot-product expression in VPlan, after landing:
+ // https://github.com/llvm/llvm-project/pull/147302
+ Cost *= 2;
+
+ if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
+ // i16 -> i64 is natively supported for udot/sdot
+ if (AccumLT.second.getScalarType() == MVT::i64 &&
+ InputLT.second.getScalarType() == MVT::i16)
+ return Cost;
+ // i8 -> i64 is supported with an extra level of extends
+ if (AccumLT.second.getScalarType() == MVT::i64 &&
+ InputLT.second.getScalarType() == MVT::i8)
+ // FIXME: This cost should probably be a little higher, e.g. Cost + 2
+ // because it requires two extra extends on the inputs. But if we'd change
+ // that now, a regular reduction would be cheaper because the costs of
+ // the extends in the IR are still counted. This can be fixed
+ // after https://github.com/llvm/llvm-project/pull/147302 has landed.
+ return Cost;
+ }
+
+ // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
+ if (ST->isSVEorStreamingSVEAvailable() ||
+ (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
+ ST->hasDotProd())) {
+ if (AccumLT.second.getScalarType() == MVT::i32 &&
+ InputLT.second.getScalarType() == MVT::i8)
+ return Cost;
+ }
+
+ // Add additional cost for the extends that would need to be inserted.
+ return Cost + 4;
}
InstructionCost
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index c3b0bc8c00a74..47f43339dc35f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -82,11 +82,11 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 27
+; CHECK: Cost for VF 8: 25
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 48
+; CHECK: Cost for VF 16: 41
; CHECK: LV: Selecting VF: 16
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index c0995ec150c8d..e40f6f3647e52 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -204,37 +204,33 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP19]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -670,39 +666,35 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 4 x i32> [[TMP14]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP21]] = add <vscale x 4 x i32> [[TMP19]], [[TMP20]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -996,36 +988,32 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18]] = add <vscale x 4 x i32> [[TMP17]], [[TMP15]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -1140,32 +1128,28 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 {
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP12:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP11]]
-; CHECK-SVE-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[TMP12]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]])
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP3]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE2]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -1277,36 +1261,32 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP18]] = add <vscale x 4 x i32> [[TMP16]], [[TMP17]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -1422,32 +1402,28 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
; CHECK-SVE-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
; CHECK-SVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[OFFSET]], i64 0
-; CHECK-SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0
+; CHECK-SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-SVE-NEXT: [[TMP7:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP7]]
-; CHECK-SVE-NEXT: [[TMP9]] = add <vscale x 4 x i32> [[TMP8]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <16 x i32> @llvm.experimental.vector.partial.reduce.add.v16i32.v16i32(<16 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-SVE-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP9]])
+; CHECK-SVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 9d4a969b571e7..6ce92ea34548f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -89,7 +89,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]]
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[IV_NEXT]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
@@ -98,23 +98,23 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT6]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IDX_NEG]], [[N_VEC5]]
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <1 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[IV]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i32> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <1 x i32> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i8> poison, i8 [[TMP9]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT10]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT11]] to <4 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[TMP13]] = add <4 x i32> [[TMP14]], [[VEC_PHI9]]
+; CHECK-NEXT: [[TMP13:%.*]] = mul <4 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[PARTIAL_REDUCE12]] = call <1 x i32> @llvm.experimental.vector.partial.reduce.add.v1i32.v4i32(<1 x i32> [[VEC_PHI9]], <4 x i32> [[TMP13]])
; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> [[PARTIAL_REDUCE12]])
; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
@@ -136,7 +136,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0
; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[ACCUM1]], -1
; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: while.end.loopexit:
; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY1]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret void
@@ -495,7 +495,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
; CHECK-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-NEXT: br label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
index 8766d6540ed19..951ed978c8420 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
@@ -12,40 +12,40 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 4
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
+; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
-; CHECK-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
-; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -62,8 +62,8 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NOI8MM: vector.body:
; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
@@ -82,14 +82,14 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
-; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
+; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]]
; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NOI8MM: middle.block:
-; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NOI8MM-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NOI8MM: scalar.ph:
@@ -123,40 +123,40 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 4
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
+; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
-; CHECK-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
-; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -173,8 +173,8 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NOI8MM: vector.body:
; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
@@ -193,14 +193,14 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
-; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
+; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]]
; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NOI8MM: middle.block:
-; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NOI8MM-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NOI8MM: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 17fbbbd1d6843..077f8eee57f6b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -12,29 +12,24 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP20]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP18]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
@@ -42,43 +37,34 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP14]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP20]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP20]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = shl nuw i64 [[TMP26]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP27]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP28]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP29]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP8]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP9]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @dotp(
@@ -139,31 +125,24 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP9]], 2
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP13]], [[TMP11]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[TMP14]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod(
@@ -171,45 +150,24 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP9]], 4
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP16]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 [[TMP18]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD5]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP20]], [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP21]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 2 x i64> [[TMP22]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 2 x i64> [[TMP23]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod(
@@ -274,35 +232,26 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul i64 [[TMP10]], 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP20]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = mul i64 [[TMP10]], 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP2]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD3]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP15]], [[TMP13]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add <vscale x 2 x i64> [[TMP16]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[NEXT_GEP2]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP17]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_i16_to_i64_has_neon_dotprod(
@@ -310,49 +259,36 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP11]]
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP15]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP13]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP]], align 2
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i16>, ptr [[TMP30]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD4]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP3]], align 2
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 2 x i16>, ptr [[TMP21]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD5]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD6]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP22]], [[TMP16]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP23]], [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26]] = add <vscale x 2 x i64> [[TMP24]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP27]] = add <vscale x 2 x i64> [[TMP25]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[WIDE_LOAD4]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[NEXT_GEP3]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD5]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD6]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP27]], [[TMP26]]
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i64 @not_dotp_i16_to_i64_has_neon_dotprod(
@@ -1055,22 +991,18 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled(
; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP13]], 2
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP16]], 4
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1
@@ -1082,38 +1014,38 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = mul nsw <vscale x 4 x i32> [[TMP21]], [[TMP36]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP38]], [[VEC_PHI3]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD6]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <vscale x 4 x i32> [[TMP25]], [[TMP42]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = add <vscale x 4 x i32> [[TMP28]], [[VEC_PHI2]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD7]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <vscale x 4 x i32> [[TMP31]], [[TMP33]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP35]] = add <vscale x 4 x i32> [[TMP34]], [[VEC_PHI1]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = mul nsw <vscale x 4 x i32> [[TMP37]], [[TMP39]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP41]] = add <vscale x 4 x i32> [[TMP40]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP12]], [[TMP23]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]])
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP19]])
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP21]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP41]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP35]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP30]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP23]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -1121,26 +1053,22 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP13]], 3
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 32
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP16]], 8
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 32
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE29:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1
@@ -1152,90 +1080,74 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP56]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = shl nuw i64 [[TMP25]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP26]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP72]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <vscale x 4 x i32> [[TMP28]], [[TMP66]]
-; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 4 x i32> [[TMP82]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[TMP50]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add <vscale x 4 x i32> [[TMP31]], [[VEC_PHI7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = shl nuw i64 [[TMP35]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP36]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 4 x i8>, ptr [[TMP37]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD11]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD12]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = shl nuw i64 [[TMP41]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP42]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 4 x i8>, ptr [[TMP43]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD13]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD14]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = mul nsw <vscale x 4 x i32> [[TMP38]], [[TMP44]]
-; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = mul nsw <vscale x 4 x i32> [[TMP39]], [[TMP45]]
-; CHECK-INTERLEAVED-NEXT: [[TMP48]] = add <vscale x 4 x i32> [[TMP46]], [[VEC_PHI4]]
-; CHECK-INTERLEAVED-NEXT: [[TMP49]] = add <vscale x 4 x i32> [[TMP47]], [[VEC_PHI5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = shl nuw i64 [[TMP51]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP52]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x i8>, ptr [[TMP53]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD15]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD16]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = shl nuw i64 [[TMP57]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP58]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD17:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 4 x i8>, ptr [[TMP59]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD17]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD18]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nsw <vscale x 4 x i32> [[TMP54]], [[TMP60]]
-; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw <vscale x 4 x i32> [[TMP55]], [[TMP61]]
-; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add <vscale x 4 x i32> [[TMP62]], [[VEC_PHI2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP65]] = add <vscale x 4 x i32> [[TMP63]], [[VEC_PHI3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = shl nuw i64 [[TMP67]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP68]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 4 x i8>, ptr [[TMP69]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD19]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD20]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = shl nuw i64 [[TMP73]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP74]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD22:%.*]] = load <vscale x 4 x i8>, ptr [[TMP75]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD21]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD22]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = mul nsw <vscale x 4 x i32> [[TMP70]], [[TMP76]]
-; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = mul nsw <vscale x 4 x i32> [[TMP71]], [[TMP77]]
-; CHECK-INTERLEAVED-NEXT: [[TMP80]] = add <vscale x 4 x i32> [[TMP78]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP81]] = add <vscale x 4 x i32> [[TMP79]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP17]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP18]])
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP24]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP25]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP30]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP31]]
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP48]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE22]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP33]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE23]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP34]])
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = mul nsw <16 x i32> [[TMP36]], [[TMP39]]
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul nsw <16 x i32> [[TMP37]], [[TMP40]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE28]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP41]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX23:%.*]] = add <vscale x 4 x i32> [[TMP65]], [[TMP64]]
-; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX23]])
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX24:%.*]] = add <vscale x 4 x i32> [[TMP49]], [[TMP48]]
-; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX24]])
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX25:%.*]] = add <vscale x 4 x i32> [[TMP33]], [[TMP50]]
-; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX25]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE29]], [[PARTIAL_REDUCE28]]
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE23]], [[PARTIAL_REDUCE22]]
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -2220,36 +2132,32 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK-INTERLEAVE1: for.body.preheader:
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[COST]], i32 0
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP16]], [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = add <vscale x 2 x i64> [[TMP17]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP18]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -2261,50 +2169,32 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK-INTERLEAVED: for.body.preheader:
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[COST]], i32 0
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = shl nuw i64 [[TMP20]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 [[TMP21]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP2]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP22]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD5]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP23]], [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP24]], [[TMP18]]
-; CHECK-INTERLEAVED-NEXT: [[TMP27]] = add <vscale x 2 x i64> [[TMP25]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP28]] = add <vscale x 2 x i64> [[TMP26]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP28]], [[TMP27]]
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
index 64cb33181cc1e..a43a160722163 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
@@ -12,30 +12,25 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sub <vscale x 4 x i32> zeroinitializer, [[TMP13]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sub <16 x i32> zeroinitializer, [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP15]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
@@ -43,45 +38,36 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP9]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP20]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sub <vscale x 4 x i32> zeroinitializer, [[TMP21]]
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sub <vscale x 4 x i32> zeroinitializer, [[TMP22]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26]] = add <vscale x 4 x i32> [[VEC_PHI1]], [[TMP24]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP14]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP15]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sub <16 x i32> zeroinitializer, [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP26]], [[TMP25]]
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @dotp(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 370bfc641001a..d368cf79d9885 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -17,16 +17,16 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP3]] = add <16 x i32> [[TMP2]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
@@ -38,22 +38,22 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP5]] = add <16 x i32> [[TMP3]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i32> [[TMP4]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP6]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
@@ -199,16 +199,16 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i64> [[TMP3]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP3]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP4]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
@@ -220,22 +220,22 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i64> [[TMP4]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i64> [[TMP5]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i64> [[TMP7]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
@@ -293,16 +293,16 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <8 x i64> [[TMP3]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP4]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
@@ -314,22 +314,22 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[TMP1]], i32 8
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <8 x i64> [[TMP5]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP7]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
@@ -761,16 +761,16 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i32> [[TMP3]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
@@ -782,22 +782,22 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i32> [[TMP4]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[TMP5]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
@@ -981,21 +981,21 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP6]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -1012,26 +1012,26 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP12]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[VEC_PHI2]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP3]])
; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add nuw i32 [[VEC_PHI1]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP22]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP8]], [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE2]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
index 01d103264fafe..5c68db2a9518d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
@@ -192,7 +192,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK: LV(REG): VF = 16
; CHECK-NEXT: LV(REG): Found max usage: 2 item
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 9 registers
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 24 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 12 registers
; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index d4e5dea3d4aab..0cba88448af48 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-none-unknown-elf"
; Tests for printing VPlans that are enabled under AArch64
define i32 @print_partial_reduction(ptr %a, ptr %b) {
-; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' {
+; CHECK: VPlan 'Initial VPlan for VF={4,8,16},UF>=1' {
; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF
; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
@@ -71,7 +71,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK: VPlan 'Final VPlan for VF={8,16},UF={1}' {
+; CHECK: VPlan 'Final VPlan for VF={4,8,16},UF={1}' {
; CHECK-NEXT: Live-in ir<1024> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
>From 5a9c0a38744bf76418deea4d2e1486bea8a17df0 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 29 Sep 2025 10:33:15 +0000
Subject: [PATCH 2/2] Widening is not supported for <1 x ..> either
---
.../AArch64/AArch64TargetTransformInfo.cpp | 13 +++++++++++--
.../partial-reduce-dot-product-epilogue.ll | 18 +++++++++---------
.../LoopVectorize/AArch64/vplan-printing.ll | 4 ++--
3 files changed, 22 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a2ee509fe4cb7..ece53cd4af217 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5667,9 +5667,18 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
VectorType *AccumVectorType =
VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
- // We don't yet support widening for <vscale x 1 x ..> accumulators.
- if (AccumVectorType->getElementCount() == ElementCount::getScalable(1))
+ // We don't yet support all kinds of legalization (e.g. widening
+ // of <[vscale x] 1 x ..> accumulators)
+ auto TA = TLI->getTypeAction(AccumVectorType->getContext(),
+ EVT::getEVT(AccumVectorType));
+ switch (TA) {
+ default:
return Invalid;
+ case TargetLowering::TypeLegal:
+ case TargetLowering::TypePromoteInteger:
+ case TargetLowering::TypeSplitVector:
+ break;
+ }
// Check what kind of type-legalisation happens.
std::pair<InstructionCost, MVT> AccumLT =
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 6ce92ea34548f..d3af94dd62258 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -89,7 +89,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]]
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[IV_NEXT]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
@@ -98,23 +98,23 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT6]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IDX_NEG]], [[N_VEC5]]
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <1 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[IV]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <1 x i32> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i32> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i8> poison, i8 [[TMP9]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT10]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT11]] to <4 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = mul <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT: [[PARTIAL_REDUCE12]] = call <1 x i32> @llvm.experimental.vector.partial.reduce.add.v1i32.v4i32(<1 x i32> [[VEC_PHI9]], <4 x i32> [[TMP13]])
+; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i32> [[TMP11]], [[TMP8]]
+; CHECK-NEXT: [[TMP13]] = add <4 x i32> [[TMP14]], [[VEC_PHI9]]
; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> [[PARTIAL_REDUCE12]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
@@ -136,7 +136,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0
; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[ACCUM1]], -1
; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: while.end.loopexit:
; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY1]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 0cba88448af48..d4e5dea3d4aab 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-none-unknown-elf"
; Tests for printing VPlans that are enabled under AArch64
define i32 @print_partial_reduction(ptr %a, ptr %b) {
-; CHECK: VPlan 'Initial VPlan for VF={4,8,16},UF>=1' {
+; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' {
; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF
; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
@@ -71,7 +71,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK: VPlan 'Final VPlan for VF={4,8,16},UF={1}' {
+; CHECK: VPlan 'Final VPlan for VF={8,16},UF={1}' {
; CHECK-NEXT: Live-in ir<1024> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
More information about the llvm-commits
mailing list