[llvm] [LoopVectorizer] Add support for chaining partial reductions (PR #120272)
Nicholas Guy via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 22 06:31:34 PST 2025
NickGuy-Arm (https://github.com/NickGuy-Arm) updated https://github.com/llvm/llvm-project/pull/120272
From 34d5f25a026e6bdb337a1ba8e1a2cf7a8a4291d5 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Tue, 17 Dec 2024 17:07:45 +0000
Subject: [PATCH 1/6] [LoopVectorizer] Add support for chaining partial
reductions
---
.../AArch64/AArch64TargetTransformInfo.h | 2 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 55 +-
.../Transforms/Vectorize/VPRecipeBuilder.h | 4 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 5 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 14 +-
.../AArch64/partial-reduction-chained.ll | 568 ++++++++++++++++++
6 files changed, 623 insertions(+), 25 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/partial-reduction-chained.ll
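For context, the scalar pattern this patch teaches the vectorizer to handle is a single reduction variable updated by more than one extended-multiply term per iteration. A minimal C++ sketch of the shape being targeted (mirroring the chained_partial_reduce_add_sub test added below; the function name and types here are illustrative):

#include <cstdint>

int32_t chained_partial_reduce_add_sub(const int8_t *a, const int8_t *b,
                                       const int8_t *c, int n) {
  int32_t res = 0;
  for (int i = 0; i < n; ++i) {
    res += (int32_t)a[i] * (int32_t)b[i]; // first partial reduction
    res -= (int32_t)a[i] * (int32_t)c[i]; // chained onto the first
  }
  return res;
}

Each term becomes its own llvm.experimental.vector.partial.reduce.add call in the vector body, with the previous call's result feeding in as the accumulator of the next, as the CHECK lines in the new test show.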
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 8e7e590c173ff2..c6cebcca679353 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -368,7 +368,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost Invalid = InstructionCost::getInvalid();
InstructionCost Cost(TTI::TCC_Basic);
- if (Opcode != Instruction::Add)
+ if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
return Invalid;
if (InputTypeA != InputTypeB)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 99f6a8860f0f4d..79be3e15594c48 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8790,12 +8790,12 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
/// are valid so recipes can be formed later.
void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
// Find all possible partial reductions.
- SmallVector<std::pair<PartialReductionChain, unsigned>, 1>
+ SmallVector<std::pair<PartialReductionChain, unsigned>>
PartialReductionChains;
- for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
- if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
- getScaledReduction(Phi, RdxDesc, Range))
- PartialReductionChains.push_back(*Pair);
+ for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
+ if (auto SR = getScaledReduction(Phi, RdxDesc.getLoopExitInstr(), Range))
+ PartialReductionChains.append(*SR);
+ }
// A partial reduction is invalid if any of its extends are used by
// something that isn't another partial reduction. This is because the
@@ -8823,26 +8823,42 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
}
}
-std::optional<std::pair<PartialReductionChain, unsigned>>
-VPRecipeBuilder::getScaledReduction(PHINode *PHI,
- const RecurrenceDescriptor &Rdx,
+std::optional<SmallVector<std::pair<PartialReductionChain, unsigned>>>
+VPRecipeBuilder::getScaledReduction(Instruction *PHI,
+ Instruction *RdxExitInstr,
VFRange &Range) {
+
+ if(!CM.TheLoop->contains(RdxExitInstr))
+ return std::nullopt;
+
// TODO: Allow scaling reductions when predicating. The select at
// the end of the loop chooses between the phi value and most recent
// reduction result, both of which have different VFs to the active lane
// mask when scaling.
- if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
+ if (CM.blockNeedsPredicationForAnyReason(RdxExitInstr->getParent()))
return std::nullopt;
- auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
+ auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
if (!Update)
return std::nullopt;
Value *Op = Update->getOperand(0);
Value *PhiOp = Update->getOperand(1);
- if (Op == PHI) {
- Op = Update->getOperand(1);
- PhiOp = Update->getOperand(0);
+ if (Op == PHI)
+ std::swap(Op, PhiOp);
+
+ SmallVector<std::pair<PartialReductionChain, unsigned>> Chains;
+
+ if (auto *OpInst = dyn_cast<Instruction>(Op)) {
+ if(auto SR0 = getScaledReduction(PHI, OpInst, Range)) {
+ Chains.append(*SR0);
+ PHI = SR0->rbegin()->first.Reduction;
+
+ Op = Update->getOperand(0);
+ PhiOp = Update->getOperand(1);
+ if (Op == PHI)
+ std::swap(Op, PhiOp);
+ }
}
if (PhiOp != PHI)
return std::nullopt;
@@ -8860,12 +8876,16 @@ VPRecipeBuilder::getScaledReduction(PHINode *PHI,
Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
+ // Check that the extends extend from the same type.
+ if (A->getType() != B->getType())
+ return std::nullopt;
+
TTI::PartialReductionExtendKind OpAExtend =
TargetTransformInfo::getPartialReductionExtendKind(ExtA);
TTI::PartialReductionExtendKind OpBExtend =
TargetTransformInfo::getPartialReductionExtendKind(ExtB);
- PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp);
+ PartialReductionChain Chain(RdxExitInstr, ExtA, ExtB, BinOp);
unsigned TargetScaleFactor =
PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
@@ -8880,9 +8900,9 @@ VPRecipeBuilder::getScaledReduction(PHINode *PHI,
return Cost.isValid();
},
Range))
- return std::make_pair(Chain, TargetScaleFactor);
+ Chains.push_back(std::make_pair(Chain, TargetScaleFactor));
- return std::nullopt;
+ return Chains;
}
VPRecipeBase *
@@ -8979,7 +8999,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
VPValue *BinOp = Operands[0];
VPValue *Phi = Operands[1];
- if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe()))
+ VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
+ if (isa<VPReductionPHIRecipe>(BinOpRecipe) || isa<VPPartialReductionRecipe>(BinOpRecipe))
std::swap(BinOp, Phi);
return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi,
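The recursion above walks from the loop-exit update down through its non-phi operand, so a chain like add-then-sub is discovered innermost link first, and each link's result stands in as the accumulator for the next; tryToCreatePartialReduction then canonicalises the accumulator into operand 1, accepting either a reduction phi or an earlier partial reduction there. A self-contained toy model of that walk, just to show the ordering (the Update struct and helpers are illustrative, not LLVM's):

// Toy model of the recursive chain walk in getScaledReduction: each
// reduction update consumes the previous update (or the phi) as its
// accumulator operand, and the walk recurses into that operand first
// so inner partial reductions are collected before outer ones.
#include <cstdio>
#include <vector>

struct Update {
  const char *Name;
  Update *Accumulator; // previous link in the chain; nullptr = the phi
};

// Returns the chain in execution order: innermost update first.
std::vector<Update *> collectChain(Update *U) {
  std::vector<Update *> Chain;
  if (U->Accumulator)                     // recurse into the non-phi operand
    Chain = collectChain(U->Accumulator); // inner links come first
  Chain.push_back(U);                     // then this link
  return Chain;
}

int main() {
  // res.phi -> add -> sub, as in the chained_partial_reduce_add_sub test.
  Update Add{"partial.reduce(add)", nullptr};
  Update Sub{"partial.reduce(sub)", &Add};
  for (Update *U : collectChain(&Sub))
    std::printf("%s\n", U->Name); // prints the add link, then the sub link
  return 0;
}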
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index cf653e2d3e6584..6be9a716cacbfd 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -144,8 +144,8 @@ class VPRecipeBuilder {
/// Returns null if no scaled reduction was found, otherwise a pair with a
/// struct containing reduction information and the scaling factor between the
/// number of elements in the input and output.
- std::optional<std::pair<PartialReductionChain, unsigned>>
- getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx,
+ std::optional<SmallVector<std::pair<PartialReductionChain, unsigned>>>
+ getScaledReduction(Instruction *PHI, Instruction *RdxExitInstr,
VFRange &Range);
public:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 87f87bf1437196..b2d3d3944c1a5b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2453,13 +2453,14 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
: VPSingleDefRecipe(VPDef::VPPartialReductionSC,
ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
Opcode(Opcode) {
- assert(isa<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe()) &&
+ auto *DefiningRecipe = getOperand(1)->getDefiningRecipe();
+ assert((isa<VPReductionPHIRecipe>(DefiningRecipe) || isa<VPPartialReductionRecipe>(DefiningRecipe)) &&
"Unexpected operand order for partial reduction recipe");
}
~VPPartialReductionRecipe() override = default;
VPPartialReductionRecipe *clone() override {
- return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1));
+ return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), getUnderlyingInstr());
}
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 979a8e0768a991..4fb32c14398e51 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -317,13 +317,21 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
auto &Builder = State.Builder;
- assert(getOpcode() == Instruction::Add &&
- "Unhandled partial reduction opcode");
-
Value *BinOpVal = State.get(getOperand(0));
Value *PhiVal = State.get(getOperand(1));
assert(PhiVal && BinOpVal && "Phi and Mul must be set");
+ auto Opcode = getOpcode();
+
+ // Currently we don't have a partial_reduce_sub intrinsic,
+ // so mimic the behaviour by negating the second operand
+ if(Opcode == Instruction::Sub) {
+ BinOpVal = Builder.CreateSub(Constant::getNullValue(BinOpVal->getType()), BinOpVal);
+ Opcode = Instruction::Add;
+ }
+
+ assert(Opcode == Instruction::Add && "Unhandled partial reduction opcode");
+
Type *RetTy = PhiVal->getType();
CallInst *V = Builder.CreateIntrinsic(
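The Sub handling above (backed out again in patch 3 of this series) leans on the wrap-around identity acc - x == acc + (0 - x), so a subtracting link can still be lowered through the add-only partial-reduce intrinsic. A runnable fixed-width model of that lowering, assuming the common chunk-wise reading of the intrinsic (its semantics permit any association of the input lanes):

// Fixed-width model of the lowering in execute(): a "partial reduce
// sub" is expressed as a partial reduce add of the negated input,
// matching Builder.CreateSub(Constant::getNullValue(Ty), BinOpVal).
// Values are kept small so the C++ sketch avoids signed overflow;
// LLVM's sub simply wraps.
#include <array>
#include <cassert>
#include <cstdint>

using Acc = std::array<int32_t, 4>;
using In = std::array<int32_t, 16>;

Acc partialReduceAdd(Acc Accum, const In &X) {
  for (int I = 0; I < 16; ++I)
    Accum[I % 4] += X[I]; // fold each input lane into an accumulator lane
  return Accum;
}

Acc partialReduceSub(Acc Accum, In X) {
  for (int32_t &V : X)
    V = 0 - V; // negate the input, then reuse the add-based reduce
  return partialReduceAdd(Accum, X);
}

int main() {
  Acc A{1, 2, 3, 4};
  In X{};
  for (int I = 0; I < 16; ++I)
    X[I] = I - 8;
  Acc R = partialReduceSub(A, X);
  Acc E = A;
  for (int I = 0; I < 16; ++I)
    E[I % 4] -= X[I]; // subtracting each lane directly gives the same result
  assert(R == E);
  return 0;
}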
diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll b/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll
new file mode 100644
index 00000000000000..4272e2f7552495
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll
@@ -0,0 +1,568 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -o - %s --passes=loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -scalable-vectorization=on | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) vscale_range(1,16)
+define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-LABEL: define i32 @chained_partial_reduce_add_sub(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP18]]
+; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP29]])
+; CHECK-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP27]]
+; CHECK-NEXT: [[TMP33:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP31]]
+; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]], <vscale x 16 x i32> [[TMP33]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE9]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
+; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AC]]
+; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]]
+; CHECK-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_DB]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %res.0.lcssa = phi i32 [ %sub, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %sub, %for.body ]
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %add = add nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %sub = sub i32 %add, %mul.ac
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-LABEL: define i32 @chained_partial_reduce_add_add(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP18]]
+; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP29]])
+; CHECK-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP27]]
+; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]], <vscale x 16 x i32> [[TMP31]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE9]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
+; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AC]]
+; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]]
+; CHECK-NEXT: [[SUB]] = add i32 [[ADD]], [[MUL_DB]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %res.0.lcssa = phi i32 [ %add.2, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %add.2, %for.body ]
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %add = add nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %add.2 = add i32 %add, %mul.ac
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-LABEL: define i32 @chained_partial_reduce_sub_add(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP18]]
+; CHECK-NEXT: [[TMP31:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP29]]
+; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP31]])
+; CHECK-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP27]]
+; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]], <vscale x 16 x i32> [[TMP33]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE9]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY1]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
+; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]]
+; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]]
+; CHECK-NEXT: [[SUB]] = add i32 [[SUB1]], [[MUL_DB]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %res.0.lcssa = phi i32 [ %add, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %add, %for.body ]
+
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %sub = sub nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %add = add i32 %sub, %mul.ac
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-LABEL: define i32 @chained_partial_reduce_sub_sub(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP18]]
+; CHECK-NEXT: [[TMP31:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP29]]
+; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP31]])
+; CHECK-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP27]]
+; CHECK-NEXT: [[TMP35:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP33]]
+; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]], <vscale x 16 x i32> [[TMP35]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE9]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY1]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
+; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]]
+; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]]
+; CHECK-NEXT: [[ADD]] = sub i32 [[SUB]], [[MUL_DB]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %res.0.lcssa = phi i32 [ %sub.2, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]
+
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %sub = sub nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %sub.2 = sub i32 %sub, %mul.ac
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
+; CHECK-NEXT: [[TMP19:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP21:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP22:%.*]] = mul nsw <vscale x 16 x i32> [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[TMP23:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP22]]
+; CHECK-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP23]])
+; CHECK-NEXT: [[TMP24:%.*]] = mul nsw <vscale x 16 x i32> [[TMP19]], [[TMP21]]
+; CHECK-NEXT: [[PARTIAL_REDUCE4:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP24]])
+; CHECK-NEXT: [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP26:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP25]]
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE4]], <vscale x 16 x i32> [[TMP26]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE5]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEXT: [[E_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
+; CHECK-NEXT: [[E_EXT:%.*]] = sext i8 [[E_VAL]] to i32
+; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[C_EXT]], [[D_EXT]]
+; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]]
+; CHECK-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[C_EXT]], [[E_EXT]]
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AB]]
+; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[D_EXT]], [[E_EXT]]
+; CHECK-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_DB]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %res.0.lcssa = phi i32 [ %sub.2, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]
+
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %sub = sub nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %add = add nsw i32 %sub, %mul.ac
+ %mul.bc = mul nsw i32 %b.ext, %c.ext
+ %sub.2 = sub i32 %add, %mul.bc
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+attributes #0 = { mustprogress noinline nounwind vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-x3" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" }
+
+
+!0 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!1 = distinct !{!0}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META4]] = distinct !{[[META5:![0-9]+]]}
+; CHECK: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+;.
From abe57fbe05839e527ba4c08114ef9cf32598a5b2 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Wed, 15 Jan 2025 13:49:02 +0000
Subject: [PATCH 2/6] Format
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 10 +++++-----
llvm/lib/Transforms/Vectorize/VPlan.h | 6 ++++--
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 +++--
3 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 79be3e15594c48..a18cf3c9bec2bd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8824,11 +8824,10 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
}
std::optional<SmallVector<std::pair<PartialReductionChain, unsigned>>>
-VPRecipeBuilder::getScaledReduction(Instruction *PHI,
- Instruction *RdxExitInstr,
+VPRecipeBuilder::getScaledReduction(Instruction *PHI, Instruction *RdxExitInstr,
VFRange &Range) {
- if(!CM.TheLoop->contains(RdxExitInstr))
+ if (!CM.TheLoop->contains(RdxExitInstr))
return std::nullopt;
// TODO: Allow scaling reductions when predicating. The select at
@@ -8850,7 +8849,7 @@ VPRecipeBuilder::getScaledReduction(Instruction *PHI,
SmallVector<std::pair<PartialReductionChain, unsigned>> Chains;
if (auto *OpInst = dyn_cast<Instruction>(Op)) {
- if(auto SR0 = getScaledReduction(PHI, OpInst, Range)) {
+ if (auto SR0 = getScaledReduction(PHI, OpInst, Range)) {
Chains.append(*SR0);
PHI = SR0->rbegin()->first.Reduction;
@@ -9000,7 +8999,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
VPValue *BinOp = Operands[0];
VPValue *Phi = Operands[1];
VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
- if (isa<VPReductionPHIRecipe>(BinOpRecipe) || isa<VPPartialReductionRecipe>(BinOpRecipe))
+ if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
+ isa<VPPartialReductionRecipe>(BinOpRecipe))
std::swap(BinOp, Phi);
return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index b2d3d3944c1a5b..9e09bfc62105c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2454,13 +2454,15 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
Opcode(Opcode) {
auto *DefiningRecipe = getOperand(1)->getDefiningRecipe();
- assert((isa<VPReductionPHIRecipe>(DefiningRecipe) || isa<VPPartialReductionRecipe>(DefiningRecipe)) &&
+ assert((isa<VPReductionPHIRecipe>(DefiningRecipe) ||
+ isa<VPPartialReductionRecipe>(DefiningRecipe)) &&
"Unexpected operand order for partial reduction recipe");
}
~VPPartialReductionRecipe() override = default;
VPPartialReductionRecipe *clone() override {
- return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), getUnderlyingInstr());
+ return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
+ getUnderlyingInstr());
}
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 4fb32c14398e51..668c317033fe78 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -325,8 +325,9 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
// Currently we don't have a partial_reduce_sub intrinsic,
// so mimic the behaviour by negating the second operand
- if(Opcode == Instruction::Sub) {
- BinOpVal = Builder.CreateSub(Constant::getNullValue(BinOpVal->getType()), BinOpVal);
+ if (Opcode == Instruction::Sub) {
+ BinOpVal = Builder.CreateSub(Constant::getNullValue(BinOpVal->getType()),
+ BinOpVal);
Opcode = Instruction::Add;
}
From 6c4a6b0146a6562a7280e72b74958d1d77b3d858 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Mon, 20 Jan 2025 14:34:03 +0000
Subject: [PATCH 3/6] Remove partial.reduce.sub support
---
.../Target/AArch64/AArch64TargetTransformInfo.h | 2 +-
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 15 +++------------
2 files changed, 4 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index c6cebcca679353..8e7e590c173ff2 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -368,7 +368,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost Invalid = InstructionCost::getInvalid();
InstructionCost Cost(TTI::TCC_Basic);
- if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
+ if (Opcode != Instruction::Add)
return Invalid;
if (InputTypeA != InputTypeB)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 668c317033fe78..979a8e0768a991 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -317,22 +317,13 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
auto &Builder = State.Builder;
+ assert(getOpcode() == Instruction::Add &&
+ "Unhandled partial reduction opcode");
+
Value *BinOpVal = State.get(getOperand(0));
Value *PhiVal = State.get(getOperand(1));
assert(PhiVal && BinOpVal && "Phi and Mul must be set");
- auto Opcode = getOpcode();
-
- // Currently we don't have a partial_reduce_sub intrinsic,
- // so mimic the behaviour by negating the second operand
- if (Opcode == Instruction::Sub) {
- BinOpVal = Builder.CreateSub(Constant::getNullValue(BinOpVal->getType()),
- BinOpVal);
- Opcode = Instruction::Add;
- }
-
- assert(Opcode == Instruction::Add && "Unhandled partial reduction opcode");
-
Type *RetTy = PhiVal->getType();
CallInst *V = Builder.CreateIntrinsic(
From 2e18209632c18e544d17e28db7d2efc2a043f329 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Mon, 20 Jan 2025 14:34:47 +0000
Subject: [PATCH 4/6] No longer set the underlying instruction in
VPPartialReductionRecipe::clone
---
llvm/lib/Transforms/Vectorize/VPlan.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 9e09bfc62105c1..1550630fb819a6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2461,8 +2461,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
~VPPartialReductionRecipe() override = default;
VPPartialReductionRecipe *clone() override {
- return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
- getUnderlyingInstr());
+ return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1));
}
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
From f79f577f92cec30887fd1e1de574b8cd9dcb1396 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Mon, 20 Jan 2025 14:35:07 +0000
Subject: [PATCH 5/6] Address comments and update test
---
.../Transforms/Vectorize/LoopVectorize.cpp | 7 +-
.../AArch64/partial-reduction-chained.ll | 568 ------
.../AArch64/partial-reduce-chained.ll | 1557 +++++++++++++++++
3 files changed, 1560 insertions(+), 572 deletions(-)
delete mode 100644 llvm/test/CodeGen/AArch64/partial-reduction-chained.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a18cf3c9bec2bd..d913954c51c007 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8848,6 +8848,9 @@ VPRecipeBuilder::getScaledReduction(Instruction *PHI, Instruction *RdxExitInstr,
SmallVector<std::pair<PartialReductionChain, unsigned>> Chains;
+ // Try and get a scaled reduction from the first non-phi operand.
+ // If one is found, we use the discovered reduction instruction in
+ // place of the accumulator for costing.
if (auto *OpInst = dyn_cast<Instruction>(Op)) {
if (auto SR0 = getScaledReduction(PHI, OpInst, Range)) {
Chains.append(*SR0);
@@ -8875,10 +8878,6 @@ VPRecipeBuilder::getScaledReduction(Instruction *PHI, Instruction *RdxExitInstr,
Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
- // Check that the extends extend from the same type.
- if (A->getType() != B->getType())
- return std::nullopt;
-
TTI::PartialReductionExtendKind OpAExtend =
TargetTransformInfo::getPartialReductionExtendKind(ExtA);
TTI::PartialReductionExtendKind OpBExtend =
diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll b/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll
deleted file mode 100644
index 4272e2f7552495..00000000000000
--- a/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll
+++ /dev/null
@@ -1,568 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S -o - %s --passes=loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -scalable-vectorization=on | FileCheck %s
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-none-unknown-elf"
-
-; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) vscale_range(1,16)
-define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
-; CHECK-LABEL: define i32 @chained_partial_reduce_add_sub(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
-; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
-; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
-; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP18]]
-; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP29]])
-; CHECK-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP27]]
-; CHECK-NEXT: [[TMP33:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP31]]
-; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]], <vscale x 16 x i32> [[TMP33]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE9]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
-; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AC]]
-; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]]
-; CHECK-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_DB]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]]
-;
-entry:
- %cmp28.not = icmp ult i32 %N, 2
- %div27 = lshr i32 %N, 1
- %wide.trip.count = zext nneg i32 %div27 to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %res.0.lcssa = phi i32 [ %sub, %for.body ]
- ret i32 %res.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %res = phi i32 [ 0, %entry ], [ %sub, %for.body ]
- %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
- %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
- %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
- %a.val = load i8, ptr %a.ptr, align 1
- %b.val = load i8, ptr %b.ptr, align 1
- %c.val = load i8, ptr %c.ptr, align 1
- %a.ext = sext i8 %a.val to i32
- %b.ext = sext i8 %b.val to i32
- %c.ext = sext i8 %c.val to i32
- %mul.ab = mul nsw i32 %a.ext, %b.ext
- %add = add nsw i32 %res, %mul.ab
- %mul.ac = mul nsw i32 %a.ext, %c.ext
- %sub = sub i32 %add, %mul.ac
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
-}
-
-define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
-; CHECK-LABEL: define i32 @chained_partial_reduce_add_add(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
-; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
-; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
-; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP18]]
-; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP29]])
-; CHECK-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP27]]
-; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]], <vscale x 16 x i32> [[TMP31]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE9]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
-; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AC]]
-; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]]
-; CHECK-NEXT: [[SUB]] = add i32 [[ADD]], [[MUL_DB]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]]
-;
-entry:
- %cmp28.not = icmp ult i32 %N, 2
- %div27 = lshr i32 %N, 1
- %wide.trip.count = zext nneg i32 %div27 to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %res.0.lcssa = phi i32 [ %add.2, %for.body ]
- ret i32 %res.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %res = phi i32 [ 0, %entry ], [ %add.2, %for.body ]
- %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
- %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
- %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
- %a.val = load i8, ptr %a.ptr, align 1
- %b.val = load i8, ptr %b.ptr, align 1
- %c.val = load i8, ptr %c.ptr, align 1
- %a.ext = sext i8 %a.val to i32
- %b.ext = sext i8 %b.val to i32
- %c.ext = sext i8 %c.val to i32
- %mul.ab = mul nsw i32 %a.ext, %b.ext
- %add = add nsw i32 %res, %mul.ab
- %mul.ac = mul nsw i32 %a.ext, %c.ext
- %add.2 = add i32 %add, %mul.ac
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
-}
-
-define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
-; CHECK-LABEL: define i32 @chained_partial_reduce_sub_add(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
-; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
-; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
-; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP18]]
-; CHECK-NEXT: [[TMP31:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP29]]
-; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP31]])
-; CHECK-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP27]]
-; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]], <vscale x 16 x i32> [[TMP33]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE9]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY1]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
-; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY1]] ]
-; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
-; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
-; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]]
-; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]]
-; CHECK-NEXT: [[SUB]] = add i32 [[SUB1]], [[MUL_DB]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]]
-;
-entry:
- %cmp28.not = icmp ult i32 %N, 2
- %div27 = lshr i32 %N, 1
- %wide.trip.count = zext nneg i32 %div27 to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %res.0.lcssa = phi i32 [ %add, %for.body ]
- ret i32 %res.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %res = phi i32 [ 0, %entry ], [ %add, %for.body ]
-
- %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
- %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
- %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
- %a.val = load i8, ptr %a.ptr, align 1
- %b.val = load i8, ptr %b.ptr, align 1
- %c.val = load i8, ptr %c.ptr, align 1
-
- %a.ext = sext i8 %a.val to i32
- %b.ext = sext i8 %b.val to i32
- %c.ext = sext i8 %c.val to i32
- %mul.ab = mul nsw i32 %a.ext, %b.ext
- %sub = sub nsw i32 %res, %mul.ab
- %mul.ac = mul nsw i32 %a.ext, %c.ext
- %add = add i32 %sub, %mul.ac
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
-}
-
-define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
-; CHECK-LABEL: define i32 @chained_partial_reduce_sub_sub(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
-; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
-; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
-; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP18]]
-; CHECK-NEXT: [[TMP31:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP29]]
-; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP31]])
-; CHECK-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP27]]
-; CHECK-NEXT: [[TMP35:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP33]]
-; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]], <vscale x 16 x i32> [[TMP35]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE9]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY1]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
-; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY1]] ]
-; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
-; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
-; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]]
-; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]]
-; CHECK-NEXT: [[ADD]] = sub i32 [[SUB]], [[MUL_DB]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]]
-;
-entry:
- %cmp28.not = icmp ult i32 %N, 2
- %div27 = lshr i32 %N, 1
- %wide.trip.count = zext nneg i32 %div27 to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %res.0.lcssa = phi i32 [ %sub.2, %for.body ]
- ret i32 %res.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]
-
- %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
- %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
- %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
- %a.val = load i8, ptr %a.ptr, align 1
- %b.val = load i8, ptr %b.ptr, align 1
- %c.val = load i8, ptr %c.ptr, align 1
-
- %a.ext = sext i8 %a.val to i32
- %b.ext = sext i8 %b.val to i32
- %c.ext = sext i8 %c.val to i32
-
- %mul.ab = mul nsw i32 %a.ext, %b.ext
- %sub = sub nsw i32 %res, %mul.ab
- %mul.ac = mul nsw i32 %a.ext, %c.ext
- %sub.2 = sub i32 %sub, %mul.ac
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
-}
-
-define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
-; CHECK-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
-; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
-; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP15]], align 1
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
-; CHECK-NEXT: [[TMP19:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP21:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP22:%.*]] = mul nsw <vscale x 16 x i32> [[TMP19]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP22]]
-; CHECK-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP23]])
-; CHECK-NEXT: [[TMP24:%.*]] = mul nsw <vscale x 16 x i32> [[TMP19]], [[TMP21]]
-; CHECK-NEXT: [[PARTIAL_REDUCE4:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP24]])
-; CHECK-NEXT: [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP26:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP25]]
-; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE4]], <vscale x 16 x i32> [[TMP26]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE5]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-NEXT: [[E_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
-; CHECK-NEXT: [[E_EXT:%.*]] = sext i8 [[E_VAL]] to i32
-; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[C_EXT]], [[D_EXT]]
-; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]]
-; CHECK-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[C_EXT]], [[E_EXT]]
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AB]]
-; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[D_EXT]], [[E_EXT]]
-; CHECK-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_DB]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]]
-;
-entry:
- %cmp28.not = icmp ult i32 %N, 2
- %div27 = lshr i32 %N, 1
- %wide.trip.count = zext nneg i32 %div27 to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %res.0.lcssa = phi i32 [ %sub.2, %for.body ]
- ret i32 %res.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]
-
- %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
- %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
- %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
- %a.val = load i8, ptr %a.ptr, align 1
- %b.val = load i8, ptr %b.ptr, align 1
- %c.val = load i8, ptr %c.ptr, align 1
-
- %a.ext = sext i8 %a.val to i32
- %b.ext = sext i8 %b.val to i32
- %c.ext = sext i8 %c.val to i32
-
- %mul.ab = mul nsw i32 %a.ext, %b.ext
- %sub = sub nsw i32 %res, %mul.ab
- %mul.ac = mul nsw i32 %a.ext, %c.ext
- %add = add nsw i32 %sub, %mul.ac
- %mul.bc = mul nsw i32 %b.ext, %c.ext
- %sub.2 = sub i32 %add, %mul.bc
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
-}
-
-attributes #0 = { mustprogress noinline nounwind vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-x3" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" }
-
-
-!0 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
-!1 = distinct !{!0}
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[META4]] = distinct !{[[META5:![0-9]+]]}
-; CHECK: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-;.
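
In the file removed above, a chained subtraction is lowered by negating the product (sub <vscale x 16 x i32> zeroinitializer, %mul) and feeding the negation to the same llvm.experimental.vector.partial.reduce.add call used for additions. A scalar C model of the two forms, assuming a round-robin lane mapping for illustration (the intrinsic itself leaves the lane association unspecified):

  /* Model of <4 x i32> @llvm.experimental.vector.partial.reduce  */
  /* .add.v4i32.v16i32: 16 input lanes folded into 4 accumulator  */
  /* lanes. The i % 4 mapping is one valid choice, not mandated.  */
  void partial_reduce_add_4_16(int acc[4], const int in[16]) {
    for (int i = 0; i < 16; ++i)
      acc[i % 4] += in[i];
  }

  /* A chained sub is the same call on the negated product:       */
  /*   %neg = sub <16 x i32> zeroinitializer, %mul                */
  /*   %red = partial.reduce.add(%phi, %neg)                      */
  void partial_reduce_sub_4_16(int acc[4], const int mul[16]) {
    for (int i = 0; i < 16; ++i)
      acc[i % 4] -= mul[i]; /* adding -mul[i] */
  }

The regenerated file below keeps the same loop bodies but checks them under separate NEON, SVE, and SVE maximize-bandwidth prefixes; note that in its first function the add/sub chain now stays as plain vector add and sub in all three configurations, while the add/add chain is still formed into chained partial.reduce.add calls.
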
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
new file mode 100644
index 00000000000000..94614f4a396317
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -0,0 +1,1557 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-NEON
+; RUN: opt --mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE
+; RUN: opt --mattr=+sve -vectorizer-maximize-bandwidth -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) vscale_range(1,16)
+define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_sub(
+; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEON-NEXT: entry:
+; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEON: vector.ph:
+; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEON: vector.body:
+; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEON-NEXT: [[TMP11:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP10]]
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP13]] = sub <16 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEON: middle.block:
+; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]])
+; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEON: scalar.ph:
+; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-NEON: for.cond.cleanup:
+; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-NEON: for.body:
+; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ]
+; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-NEON-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_AC]]
+; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]]
+;
+; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_sub(
+; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-NEXT: entry:
+; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE: vector.ph:
+; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE: vector.body:
+; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
+; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-NEXT: [[TMP19]] = sub <vscale x 4 x i32> [[TMP17]], [[TMP18]]
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE: middle.block:
+; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-SVE: scalar.ph:
+; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-SVE: for.cond.cleanup:
+; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-SVE: for.body:
+; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ]
+; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-SVE-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_AC]]
+; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]]
+;
+; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_sub(
+; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-MAXBW-NEXT: entry:
+; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE-MAXBW: vector.ph:
+; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-MAXBW: vector.body:
+; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP16]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = sub <vscale x 8 x i32> [[TMP17]], [[TMP18]]
+; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE-MAXBW: middle.block:
+; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP19]])
+; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-SVE-MAXBW: scalar.ph:
+; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-SVE-MAXBW: for.cond.cleanup:
+; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-SVE-MAXBW: for.body:
+; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_AC]]
+; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %res.0.lcssa = phi i32 [ %sub, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %sub, %for.body ]
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %add = add nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %sub = sub i32 %add, %mul.ac
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_add(
+; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEON-NEXT: entry:
+; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEON: vector.ph:
+; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEON: vector.body:
+; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
+; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEON: middle.block:
+; CHECK-NEON-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
+; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEON: scalar.ph:
+; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-NEON: for.cond.cleanup:
+; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD_2:%.*]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-NEON: for.body:
+; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_2]], [[FOR_BODY]] ]
+; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-NEON-NEXT: [[ADD_2]] = add i32 [[ADD]], [[MUL_AC]]
+; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEON-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add(
+; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT: entry:
+; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE: vector.ph:
+; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE: vector.body:
+; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
+; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-NEXT: [[TMP19]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-SVE: middle.block:
+; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-SVE: scalar.ph:
+; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-SVE: for.cond.cleanup:
+; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD_2:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-SVE: for.body:
+; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_2]], [[FOR_BODY]] ]
+; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-SVE-NEXT: [[ADD_2]] = add i32 [[ADD]], [[MUL_AC]]
+; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-SVE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add(
+; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-MAXBW-NEXT: entry:
+; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE-MAXBW: vector.ph:
+; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-MAXBW: vector.body:
+; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP17]])
+; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-SVE-MAXBW: middle.block:
+; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]])
+; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-SVE-MAXBW: scalar.ph:
+; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-SVE-MAXBW: for.cond.cleanup:
+; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD_2:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
+; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-SVE-MAXBW: for.body:
+; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_2]], [[FOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[ADD_2]] = add i32 [[ADD]], [[MUL_AC]]
+; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-SVE-MAXBW-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ %res.0.lcssa = phi i32 [ %add.2, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %add.2, %for.body ]
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %add = add nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %add.2 = add i32 %add, %mul.ac
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
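+
+; Illustrative sketch only (comments, not matched by FileCheck): the chained
+; lowering expected by the CHECK-NEON lines above folds each full-width
+; <16 x i32> product into the narrower <4 x i32> accumulator, with the second
+; call consuming the first call's result, so the whole chain stays at the
+; scaled-down VF. The names %pr.ab and %pr.ac are hypothetical:
+;   %pr.ab = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul.ab)
+;   %pr.ac = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %pr.ab, <16 x i32> %mul.ac)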
+
+define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_add(
+; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEON-NEXT: entry:
+; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEON: vector.ph:
+; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEON: vector.body:
+; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]]
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEON: middle.block:
+; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]])
+; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEON: scalar.ph:
+; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-NEON: for.cond.cleanup:
+; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-NEON: for.body:
+; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-NEON-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-NEON-NEXT: [[ADD]] = add i32 [[SUB]], [[MUL_AC]]
+; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEON-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add(
+; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT: entry:
+; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE: vector.ph:
+; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE: vector.body:
+; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
+; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-NEXT: [[TMP19]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-SVE: middle.block:
+; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-SVE: scalar.ph:
+; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-SVE: for.cond.cleanup:
+; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-SVE: for.body:
+; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-SVE-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-SVE-NEXT: [[ADD]] = add i32 [[SUB]], [[MUL_AC]]
+; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-SVE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add(
+; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-MAXBW-NEXT: entry:
+; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE-MAXBW: vector.ph:
+; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-MAXBW: vector.body:
+; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub <vscale x 8 x i32> [[VEC_PHI]], [[TMP16]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = add <vscale x 8 x i32> [[TMP17]], [[TMP18]]
+; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-SVE-MAXBW: middle.block:
+; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP19]])
+; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-SVE-MAXBW: scalar.ph:
+; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-SVE-MAXBW: for.cond.cleanup:
+; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-SVE-MAXBW: for.body:
+; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[ADD]] = add i32 [[SUB]], [[MUL_AC]]
+; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-SVE-MAXBW-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ %res.0.lcssa = phi i32 [ %add, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %add, %for.body ]
+
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %sub = sub nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %add = add i32 %sub, %mul.ac
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
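+
+; Sketch only (comments, not matched by FileCheck): for this sub/add chain
+; the checks above expect plain full-width arithmetic on the accumulator
+; rather than partial reductions, e.g. for CHECK-NEON:
+;   %t = sub <16 x i32> %vec.phi, %mul.ab
+;   %acc.next = add <16 x i32> %t, %mul.ac
+; A sub link could in principle still feed a partial reduction by negating
+; the product first (sub x, y == add x, (sub 0, y)); whether that pays off
+; is a target cost question. The %t and %acc.next names are hypothetical.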
+
+define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_sub(
+; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEON-NEXT: entry:
+; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEON: vector.ph:
+; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEON: vector.body:
+; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]]
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP13]] = sub <16 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEON: middle.block:
+; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]])
+; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEON: scalar.ph:
+; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-NEON: for.cond.cleanup:
+; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-NEON: for.body:
+; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
+; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-NEON-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-NEON-NEXT: [[SUB_2]] = sub i32 [[SUB]], [[MUL_AC]]
+; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEON-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_sub(
+; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT: entry:
+; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE: vector.ph:
+; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE: vector.body:
+; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
+; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-NEXT: [[TMP19]] = sub <vscale x 4 x i32> [[TMP17]], [[TMP18]]
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-SVE: middle.block:
+; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-SVE: scalar.ph:
+; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-SVE: for.cond.cleanup:
+; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-SVE: for.body:
+; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
+; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-SVE-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-SVE-NEXT: [[SUB_2]] = sub i32 [[SUB]], [[MUL_AC]]
+; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-SVE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_sub(
+; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-MAXBW-NEXT: entry:
+; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE-MAXBW: vector.ph:
+; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-MAXBW: vector.body:
+; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub <vscale x 8 x i32> [[VEC_PHI]], [[TMP16]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = sub <vscale x 8 x i32> [[TMP17]], [[TMP18]]
+; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-SVE-MAXBW: middle.block:
+; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP19]])
+; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-SVE-MAXBW: scalar.ph:
+; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-SVE-MAXBW: for.cond.cleanup:
+; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-SVE-MAXBW: for.body:
+; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[SUB_2]] = sub i32 [[SUB]], [[MUL_AC]]
+; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-SVE-MAXBW-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ %res.0.lcssa = phi i32 [ %sub.2, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]
+
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %sub = sub nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %sub.2 = sub i32 %sub, %mul.ac
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
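+
+; For reference (comments, not matched by FileCheck): per iteration the
+; scalar recurrence here is res -= a[i]*b[i]; res -= a[i]*c[i];, so both
+; links of the chain are subtracts and the checks above expect two
+; full-width sub instructions feeding the reduction phi, e.g. for
+; CHECK-SVE-MAXBW:
+;   %t = sub <vscale x 8 x i32> %vec.phi, %mul.ab
+;   %acc.next = sub <vscale x 8 x i32> %t, %mul.ac
+; The %t and %acc.next names are hypothetical.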
+
+define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_add_add(
+; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEON-NEXT: entry:
+; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEON: vector.ph:
+; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEON: vector.body:
+; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]])
+; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEON-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEON: middle.block:
+; CHECK-NEON-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]])
+; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEON: scalar.ph:
+; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-NEON: for.cond.cleanup:
+; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-NEON: for.body:
+; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
+; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-NEON-NEXT: [[SUB:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]]
+; CHECK-NEON-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-NEON-NEXT: [[SUB_2]] = add i32 [[ADD]], [[MUL_BC]]
+; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEON-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+;
+; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add_add(
+; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT: entry:
+; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE: vector.ph:
+; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE: vector.body:
+; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
+; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
+; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 4 x i32> [[TMP14]], [[TMP15]]
+; CHECK-SVE-NEXT: [[TMP21]] = add <vscale x 4 x i32> [[TMP19]], [[TMP20]]
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-SVE: middle.block:
+; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
+; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-SVE: scalar.ph:
+; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-SVE: for.cond.cleanup:
+; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-SVE: for.body:
+; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
+; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-SVE-NEXT: [[SUB:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]]
+; CHECK-SVE-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-SVE-NEXT: [[SUB_2]] = add i32 [[ADD]], [[MUL_BC]]
+; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]]
+;
+; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add_add(
+; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-MAXBW-NEXT: entry:
+; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE-MAXBW: vector.ph:
+; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-MAXBW: vector.body:
+; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP16]])
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP17]])
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 16 x i32> [[TMP14]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE3]], <vscale x 16 x i32> [[TMP18]])
+; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-SVE-MAXBW: middle.block:
+; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE4]])
+; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-SVE-MAXBW: scalar.ph:
+; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-SVE-MAXBW: for.cond.cleanup:
+; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-SVE-MAXBW: for.body:
+; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]]
+; CHECK-SVE-MAXBW-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[SUB_2]] = add i32 [[ADD]], [[MUL_BC]]
+; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup:                                ; preds = %for.body
+ %res.0.lcssa = phi i32 [ %sub.2, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body:                                        ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]
+
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %sub = add nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %add = add nsw i32 %sub, %mul.ac
+ %mul.bc = mul nsw i32 %b.ext, %c.ext
+ %sub.2 = add i32 %add, %mul.bc
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
+; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEON-NEXT: entry:
+; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEON: vector.ph:
+; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEON: vector.body:
+; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]]
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP13:%.*]] = add <16 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP15]] = sub <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEON-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEON: middle.block:
+; CHECK-NEON-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
+; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEON: scalar.ph:
+; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-NEON: for.cond.cleanup:
+; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-NEON: for.body:
+; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
+; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-NEON-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]]
+; CHECK-NEON-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-NEON-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_BC]]
+; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]], !loop [[META4]]
+;
+; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
+; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT: entry:
+; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE: vector.ph:
+; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE: vector.body:
+; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
+; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
+; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 4 x i32> [[TMP14]], [[TMP15]]
+; CHECK-SVE-NEXT: [[TMP21]] = sub <vscale x 4 x i32> [[TMP19]], [[TMP20]]
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-SVE: middle.block:
+; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
+; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-SVE: scalar.ph:
+; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-SVE: for.cond.cleanup:
+; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-SVE: for.body:
+; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
+; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-SVE-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]]
+; CHECK-SVE-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-SVE-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_BC]]
+; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]], !loop [[META4]]
+;
+; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
+; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-MAXBW-NEXT: entry:
+; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE-MAXBW: vector.ph:
+; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-MAXBW: vector.body:
+; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub <vscale x 8 x i32> [[VEC_PHI]], [[TMP16]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = add <vscale x 8 x i32> [[TMP17]], [[TMP18]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP21]] = sub <vscale x 8 x i32> [[TMP19]], [[TMP20]]
+; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-SVE-MAXBW: middle.block:
+; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP21]])
+; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-SVE-MAXBW: scalar.ph:
+; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-SVE-MAXBW: for.cond.cleanup:
+; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK-SVE-MAXBW: for.body:
+; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
+; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]]
+; CHECK-SVE-MAXBW-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-SVE-MAXBW-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_BC]]
+; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]], !loop [[META4]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup:                                ; preds = %for.body
+ %res.0.lcssa = phi i32 [ %sub.2, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body:                                        ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]
+
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %sub = sub nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %add = add nsw i32 %sub, %mul.ac
+ %mul.bc = mul nsw i32 %b.ext, %c.ext
+ %sub.2 = sub i32 %add, %mul.bc
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+attributes #0 = { vscale_range(1,16) }
+
+!0 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!1 = distinct !{!0}
+;.
+; CHECK-NEON: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-NEON: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-NEON: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-NEON: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK-NEON: [[META4]] = distinct !{[[META5:![0-9]+]]}
+; CHECK-NEON: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+; CHECK-NEON: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-NEON: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK-NEON: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-NEON: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK-NEON: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK-NEON: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK-NEON: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK-NEON: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK-NEON: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK-NEON: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+;.
+; CHECK-SVE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK-SVE: [[META4]] = distinct !{[[META5:![0-9]+]]}
+; CHECK-SVE: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+; CHECK-SVE: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-SVE: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK-SVE: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-SVE: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK-SVE: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK-SVE: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK-SVE: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK-SVE: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK-SVE: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK-SVE: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+;.
+; CHECK-SVE-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK-SVE-MAXBW: [[META4]] = distinct !{[[META5:![0-9]+]]}
+; CHECK-SVE-MAXBW: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+; CHECK-SVE-MAXBW: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-SVE-MAXBW: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK-SVE-MAXBW: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-SVE-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK-SVE-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK-SVE-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK-SVE-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK-SVE-MAXBW: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK-SVE-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK-SVE-MAXBW: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+;.
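
For readers skimming the generated checks: every chained_partial_reduce_* test above exercises the same scalar loop shape, differing only in whether each link of the chain adds or subtracts the next extended multiply. A minimal C++ sketch of the sub/add/sub variant (illustrative only, not part of the patch; the authoritative input is the IR above):

  // Scalar model of @chained_partial_reduce_sub_add_sub: three i8->i32
  // sign-extending multiplies chained into a single i32 accumulator.
  // Assumes N is non-negative, matching the lshr-by-1 trip count.
  int chained_sub_add_sub(const signed char *a, const signed char *b,
                          const signed char *c, int N) {
    int res = 0;
    for (int i = 0; i < N / 2; ++i) {
      res -= (int)a[i] * (int)b[i]; // sub link
      res += (int)a[i] * (int)c[i]; // add link
      res -= (int)b[i] * (int)c[i]; // sub link
    }
    return res;
  }

Where the cost model accepts it, each link is lowered to an llvm.experimental.vector.partial.reduce.add feeding the next, as in the CHECK-SVE-MAXBW block of chained_partial_reduce_add_add_add; otherwise the chain stays as plain widened mul/add/sub, as in the CHECK-SVE blocks.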
>From a03afa755dfaaf1d7e66369f334cf6ef929e5620 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Wed, 22 Jan 2025 14:30:12 +0000
Subject: [PATCH 6/6] Update test
---
.../AArch64/partial-reduce-chained.ll | 534 +-----------------
1 file changed, 1 insertion(+), 533 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index 94614f4a396317..bedf8b6b3a9b56 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -47,32 +47,6 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-NEON: scalar.ph:
-; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NEON: for.cond.cleanup:
-; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-NEON: for.body:
-; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ]
-; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-NEON-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_AC]]
-; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]]
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_sub(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -119,32 +93,6 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE: scalar.ph:
-; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-SVE: for.cond.cleanup:
-; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-SVE: for.body:
-; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-SVE-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_AC]]
-; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]]
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_sub(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -191,32 +139,6 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP19]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE-MAXBW: scalar.ph:
-; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-SVE-MAXBW: for.cond.cleanup:
-; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-SVE-MAXBW: for.body:
-; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_AC]]
-; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]]
;
entry:
%cmp28.not = icmp ult i32 %N, 2
@@ -289,32 +211,6 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-NEON: scalar.ph:
-; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NEON: for.cond.cleanup:
-; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD_2:%.*]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-NEON: for.body:
-; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_2]], [[FOR_BODY]] ]
-; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-NEON-NEXT: [[ADD_2]] = add i32 [[ADD]], [[MUL_AC]]
-; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]]
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -361,32 +257,6 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE: scalar.ph:
-; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-SVE: for.cond.cleanup:
-; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD_2:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-SVE: for.body:
-; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_2]], [[FOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-SVE-NEXT: [[ADD_2]] = add i32 [[ADD]], [[MUL_AC]]
-; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]]
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -433,32 +303,6 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE-MAXBW: scalar.ph:
-; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-SVE-MAXBW: for.cond.cleanup:
-; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD_2:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-SVE-MAXBW: for.body:
-; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_2]], [[FOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[ADD_2]] = add i32 [[ADD]], [[MUL_AC]]
-; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]]
;
entry:
%cmp28.not = icmp ult i32 %N, 2
@@ -531,32 +375,6 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-NEON: scalar.ph:
-; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NEON: for.cond.cleanup:
-; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-NEON: for.body:
-; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-NEON-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-NEON-NEXT: [[ADD]] = add i32 [[SUB]], [[MUL_AC]]
-; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]]
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -603,32 +421,6 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE: scalar.ph:
-; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-SVE: for.cond.cleanup:
-; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-SVE: for.body:
-; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-SVE-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-SVE-NEXT: [[ADD]] = add i32 [[SUB]], [[MUL_AC]]
-; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]]
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -675,32 +467,6 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP19]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE-MAXBW: scalar.ph:
-; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-SVE-MAXBW: for.cond.cleanup:
-; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-SVE-MAXBW: for.body:
-; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[ADD]] = add i32 [[SUB]], [[MUL_AC]]
-; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]]
;
entry:
%cmp28.not = icmp ult i32 %N, 2
@@ -775,32 +541,6 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-NEON: scalar.ph:
-; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NEON: for.cond.cleanup:
-; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-NEON: for.body:
-; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
-; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-NEON-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-NEON-NEXT: [[SUB_2]] = sub i32 [[SUB]], [[MUL_AC]]
-; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]]
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_sub(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -847,32 +587,6 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE: scalar.ph:
-; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-SVE: for.cond.cleanup:
-; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-SVE: for.body:
-; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-SVE-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-SVE-NEXT: [[SUB_2]] = sub i32 [[SUB]], [[MUL_AC]]
-; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]]
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_sub(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -919,32 +633,6 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP19]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE-MAXBW: scalar.ph:
-; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-SVE-MAXBW: for.cond.cleanup:
-; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-SVE-MAXBW: for.body:
-; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[SUB_2]] = sub i32 [[SUB]], [[MUL_AC]]
-; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]]
;
entry:
%cmp28.not = icmp ult i32 %N, 2
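
Likewise, @chained_partial_reduce_add_add_add accumulates three widened products per iteration, chaining three partial reductions on one phi. A hedged C sketch under the same assumptions as above:

    int chained_add_add_add(signed char *a, signed char *b, signed char *c, int n) {
      int res = 0;
      for (int i = 0; i < n; ++i) {
        res += (int)a[i] * (int)b[i];
        res += (int)a[i] * (int)c[i];
        res += (int)b[i] * (int)c[i];  /* third link in the chain */
      }
      return res;
    }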
@@ -1022,34 +710,6 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-NEON: scalar.ph:
-; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NEON: for.cond.cleanup:
-; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-NEON: for.body:
-; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
-; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-NEON-NEXT: [[SUB:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]]
-; CHECK-NEON-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
-; CHECK-NEON-NEXT: [[SUB_2]] = add i32 [[ADD]], [[MUL_BC]]
-; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]]
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add_add(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -1098,34 +758,6 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE: scalar.ph:
-; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-SVE: for.cond.cleanup:
-; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-SVE: for.body:
-; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-SVE-NEXT: [[SUB:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]]
-; CHECK-SVE-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
-; CHECK-SVE-NEXT: [[SUB_2]] = add i32 [[ADD]], [[MUL_BC]]
-; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]]
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add_add(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -1171,37 +803,9 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
-; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE4]])
+; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE4]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE-MAXBW: scalar.ph:
-; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-SVE-MAXBW: for.cond.cleanup:
-; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-SVE-MAXBW: for.body:
-; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = add nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]]
-; CHECK-SVE-MAXBW-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[SUB_2]] = add i32 [[ADD]], [[MUL_BC]]
-; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]]
;
entry:
%cmp28.not = icmp ult i32 %N, 2
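
@chained_partial_reduce_sub_add_sub mixes the two opcodes on one accumulator, which is the case that needs sub chains to be costed and formed alongside add chains. A hedged sketch, again with illustrative names:

    int chained_sub_add_sub(signed char *a, signed char *b, signed char *c, int n) {
      int res = 0;
      for (int i = 0; i < n; ++i) {
        res -= (int)a[i] * (int)b[i];
        res += (int)a[i] * (int)c[i];
        res -= (int)b[i] * (int)c[i];
      }
      return res;
    }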
@@ -1281,34 +885,6 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-NEON: scalar.ph:
-; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NEON: for.cond.cleanup:
-; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-NEON: for.body:
-; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
-; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-NEON-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]]
-; CHECK-NEON-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
-; CHECK-NEON-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_BC]]
-; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]], !loop [[META4]]
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -1357,34 +933,6 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE: scalar.ph:
-; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-SVE: for.cond.cleanup:
-; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-SVE: for.body:
-; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-SVE-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]]
-; CHECK-SVE-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
-; CHECK-SVE-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_BC]]
-; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]], !loop [[META4]]
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -1433,34 +981,6 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP21]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE-MAXBW: scalar.ph:
-; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-SVE-MAXBW: for.cond.cleanup:
-; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK-SVE-MAXBW: for.body:
-; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
-; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]]
-; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]]
-; CHECK-SVE-MAXBW-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
-; CHECK-SVE-MAXBW-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_BC]]
-; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]], !loop [[META4]]
;
entry:
%cmp28.not = icmp ult i32 %N, 2
@@ -1503,55 +1023,3 @@ attributes #0 = { vscale_range(1,16) }
!0 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!1 = distinct !{!0}
-;.
-; CHECK-NEON: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-NEON: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-NEON: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-NEON: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-NEON: [[META4]] = distinct !{[[META5:![0-9]+]]}
-; CHECK-NEON: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
-; CHECK-NEON: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-NEON: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK-NEON: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK-NEON: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK-NEON: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK-NEON: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-NEON: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-NEON: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-NEON: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK-NEON: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
-;.
-; CHECK-SVE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-SVE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-SVE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-SVE: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-SVE: [[META4]] = distinct !{[[META5:![0-9]+]]}
-; CHECK-SVE: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
-; CHECK-SVE: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-SVE: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK-SVE: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK-SVE: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK-SVE: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK-SVE: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-SVE: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-SVE: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-SVE: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK-SVE: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
-;.
-; CHECK-SVE-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-SVE-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-SVE-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-SVE-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-SVE-MAXBW: [[META4]] = distinct !{[[META5:![0-9]+]]}
-; CHECK-SVE-MAXBW: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
-; CHECK-SVE-MAXBW: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-SVE-MAXBW: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK-SVE-MAXBW: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK-SVE-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK-SVE-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK-SVE-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-SVE-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-SVE-MAXBW: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-SVE-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK-SVE-MAXBW: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
-;.
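
For context on what the PARTIAL_REDUCE values above compute: rather than reducing to a scalar on every iteration, the vector loop folds a wide vector of products into a narrower accumulator vector (e.g. sixteen i32 products into four i32 lanes) and performs the single llvm.vector.reduce.add only in the middle block. A rough scalar model of that idea, purely illustrative and not the generated code:

    int dot_partial(signed char *a, signed char *b, long n) {
      int acc[4] = {0, 0, 0, 0};              /* models a <4 x i32> accumulator */
      for (long i = 0; i < n; ++i)
        acc[i % 4] += (int)a[i] * (int)b[i];  /* lanes filled round-robin */
      /* the one horizontal reduction, after the loop */
      return acc[0] + acc[1] + acc[2] + acc[3];
    }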