[llvm] [LV][EVL] Support in-loop reduction using tail folding with EVL. (PR #90184)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Fri May 3 02:42:20 PDT 2024
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/90184
From 701a4d5e011ae014686b430b3ddf1f12f2a58c88 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 23 Apr 2024 00:55:44 -0700
Subject: [PATCH 01/16] Utils support
---
llvm/include/llvm/IR/IRBuilder.h | 19 +++
.../include/llvm/Transforms/Utils/LoopUtils.h | 6 +
llvm/lib/IR/IRBuilder.cpp | 122 ++++++++++++++++++
llvm/lib/Transforms/Utils/LoopUtils.cpp | 56 ++++++++
4 files changed, 203 insertions(+)
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index b6534a1962a2f5..4db1fe5ff93aef 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -746,49 +746,68 @@ class IRBuilderBase {
private:
CallInst *getReductionIntrinsic(Intrinsic::ID ID, Value *Src);
+ // Helper function for creating a VP reduction intrinsic call.
+ CallInst *getReductionIntrinsic(Intrinsic::ID ID, Value *Acc, Value *Src,
+ Value *Mask, Value *EVL);
+
public:
/// Create a sequential vector fadd reduction intrinsic of the source vector.
/// The first parameter is a scalar accumulator value. An unordered reduction
/// can be created by adding the reassoc fast-math flag to the resulting
/// sequential reduction.
CallInst *CreateFAddReduce(Value *Acc, Value *Src);
+ CallInst *CreateFAddReduce(Value *Acc, Value *Src, Value *EVL,
+ Value *Mask = nullptr);
/// Create a sequential vector fmul reduction intrinsic of the source vector.
/// The first parameter is a scalar accumulator value. An unordered reduction
/// can be created by adding the reassoc fast-math flag to the resulting
/// sequential reduction.
CallInst *CreateFMulReduce(Value *Acc, Value *Src);
+ CallInst *CreateFMulReduce(Value *Acc, Value *Src, Value *EVL,
+ Value *Mask = nullptr);
/// Create a vector int add reduction intrinsic of the source vector.
CallInst *CreateAddReduce(Value *Src);
+ CallInst *CreateAddReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector int mul reduction intrinsic of the source vector.
CallInst *CreateMulReduce(Value *Src);
+ CallInst *CreateMulReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector int AND reduction intrinsic of the source vector.
CallInst *CreateAndReduce(Value *Src);
+ CallInst *CreateAndReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector int OR reduction intrinsic of the source vector.
CallInst *CreateOrReduce(Value *Src);
+ CallInst *CreateOrReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector int XOR reduction intrinsic of the source vector.
CallInst *CreateXorReduce(Value *Src);
+ CallInst *CreateXorReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector integer max reduction intrinsic of the source
/// vector.
CallInst *CreateIntMaxReduce(Value *Src, bool IsSigned = false);
+ CallInst *CreateIntMaxReduce(Value *Src, Value *EVL, bool IsSigned = false,
+ Value *Mask = nullptr);
/// Create a vector integer min reduction intrinsic of the source
/// vector.
CallInst *CreateIntMinReduce(Value *Src, bool IsSigned = false);
+ CallInst *CreateIntMinReduce(Value *Src, Value *EVL, bool IsSigned = false,
+ Value *Mask = nullptr);
/// Create a vector float max reduction intrinsic of the source
/// vector.
CallInst *CreateFPMaxReduce(Value *Src);
+ CallInst *CreateFPMaxReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector float min reduction intrinsic of the source
/// vector.
CallInst *CreateFPMinReduce(Value *Src);
+ CallInst *CreateFPMinReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector float maximum reduction intrinsic of the source
/// vector. This variant follows the NaN and signed zero semantic of
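As a usage sketch for the new overloads (illustrative only; the function
and value names below are not part of the patch):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Reduce the active lanes of a vector with the EVL overload. When the
// mask is omitted, the builder splats an all-true mask, so the explicit
// vector length alone bounds the lanes that participate.
static Value *emitTailAdd(IRBuilderBase &B, Value *Vec, Value *EVL) {
  // Emits a call to llvm.vp.reduce.add with start value 0.
  return B.CreateAddReduce(Vec, EVL);
}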
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 187ace3a0cbedf..5003fa66100b46 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -403,6 +403,9 @@ Value *getShuffleReduction(IRBuilderBase &Builder, Value *Src, unsigned Op,
/// Fast-math-flags are propagated using the IRBuilder's setting.
Value *createSimpleTargetReduction(IRBuilderBase &B, Value *Src,
RecurKind RdxKind);
+Value *createSimpleTargetReduction(IRBuilderBase &B, Value *Src,
+ RecurKind RdxKind, Value *EVL,
+ Value *Mask = nullptr);
/// Create a target reduction of the given vector \p Src for a reduction of the
/// kind RecurKind::IAnyOf or RecurKind::FAnyOf. The reduction operation is
@@ -423,6 +426,9 @@ Value *createTargetReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc,
Value *createOrderedReduction(IRBuilderBase &B,
const RecurrenceDescriptor &Desc, Value *Src,
Value *Start);
+Value *createOrderedReduction(IRBuilderBase &B,
+ const RecurrenceDescriptor &Desc, Value *Src,
+ Value *Start, Value *EVL, Value *Mask = nullptr);
/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
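A corresponding sketch for the LoopUtils entry point (again, the names
are illustrative assumptions):

#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;

// Dispatches on the recurrence kind and forwards EVL (and an optional
// mask) to the matching IRBuilder helper.
static Value *reduceSignedMax(IRBuilderBase &B, Value *Vec, Value *EVL) {
  return createSimpleTargetReduction(B, Vec, RecurKind::SMax, EVL);
}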
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index d6746d1d438242..d596253744d486 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -414,6 +414,20 @@ CallInst *IRBuilderBase::getReductionIntrinsic(Intrinsic::ID ID, Value *Src) {
return CreateCall(Decl, Ops);
}
+CallInst *IRBuilderBase::getReductionIntrinsic(Intrinsic::ID ID, Value *Acc,
+ Value *Src, Value *Mask,
+ Value *EVL) {
+ Module *M = GetInsertBlock()->getParent()->getParent();
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ EVL = CreateIntCast(EVL, getInt32Ty(), /*isSigned=*/false);
+ if (!Mask)
+ Mask = CreateVectorSplat(SrcTy->getElementCount(), getTrue());
+ Value *Ops[] = {Acc, Src, Mask, EVL};
+ Type *Tys[] = {SrcTy};
+ auto Decl = Intrinsic::getDeclaration(M, ID, Tys);
+ return CreateCall(Decl, Ops);
+}
+
CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
Module *M = GetInsertBlock()->getParent()->getParent();
Value *Ops[] = {Acc, Src};
@@ -422,6 +436,11 @@ CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
return CreateCall(Decl, Ops);
}
+CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src, Value *EVL,
+ Value *Mask) {
+ return getReductionIntrinsic(Intrinsic::vp_reduce_fadd, Acc, Src, Mask, EVL);
+}
+
CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
Module *M = GetInsertBlock()->getParent()->getParent();
Value *Ops[] = {Acc, Src};
@@ -430,46 +449,149 @@ CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
return CreateCall(Decl, Ops);
}
+CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src, Value *EVL,
+ Value *Mask) {
+ return getReductionIntrinsic(Intrinsic::vp_reduce_fmul, Acc, Src, Mask, EVL);
+}
+
CallInst *IRBuilderBase::CreateAddReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_add, Src);
}
+CallInst *IRBuilderBase::CreateAddReduce(Value *Src, Value *EVL, Value *Mask) {
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ auto *EltTy = SrcTy->getElementType();
+ return getReductionIntrinsic(Intrinsic::vp_reduce_add,
+ ConstantInt::get(EltTy, 0), Src, Mask, EVL);
+}
+
CallInst *IRBuilderBase::CreateMulReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_mul, Src);
}
+CallInst *IRBuilderBase::CreateMulReduce(Value *Src, Value *EVL, Value *Mask) {
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ auto *EltTy = SrcTy->getElementType();
+ return getReductionIntrinsic(Intrinsic::vp_reduce_mul,
+ ConstantInt::get(EltTy, 1), Src, Mask, EVL);
+}
+
CallInst *IRBuilderBase::CreateAndReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_and, Src);
}
+CallInst *IRBuilderBase::CreateAndReduce(Value *Src, Value *EVL, Value *Mask) {
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ auto *EltTy = SrcTy->getElementType();
+ return getReductionIntrinsic(Intrinsic::vp_reduce_and,
+ Constant::getAllOnesValue(EltTy), Src, Mask,
+ EVL);
+}
+
CallInst *IRBuilderBase::CreateOrReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_or, Src);
}
+CallInst *IRBuilderBase::CreateOrReduce(Value *Src, Value *EVL, Value *Mask) {
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ auto *EltTy = SrcTy->getElementType();
+ return getReductionIntrinsic(Intrinsic::vp_reduce_or,
+ ConstantInt::get(EltTy, 0), Src, Mask, EVL);
+}
+
CallInst *IRBuilderBase::CreateXorReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_xor, Src);
}
+CallInst *IRBuilderBase::CreateXorReduce(Value *Src, Value *EVL, Value *Mask) {
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ auto *EltTy = SrcTy->getElementType();
+ return getReductionIntrinsic(Intrinsic::vp_reduce_xor,
+ ConstantInt::get(EltTy, 0), Src, Mask, EVL);
+}
+
CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) {
auto ID =
IsSigned ? Intrinsic::vector_reduce_smax : Intrinsic::vector_reduce_umax;
return getReductionIntrinsic(ID, Src);
}
+CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, Value *EVL,
+ bool IsSigned, Value *Mask) {
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ auto *EltTy = SrcTy->getElementType();
+ return getReductionIntrinsic(
+ IsSigned ? Intrinsic::vp_reduce_smax : Intrinsic::vp_reduce_umax,
+ IsSigned ? ConstantInt::get(EltTy, APInt::getSignedMinValue(
+ EltTy->getIntegerBitWidth()))
+ : ConstantInt::get(EltTy, 0),
+ Src, Mask, EVL);
+}
+
CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) {
auto ID =
IsSigned ? Intrinsic::vector_reduce_smin : Intrinsic::vector_reduce_umin;
return getReductionIntrinsic(ID, Src);
}
+CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, Value *EVL,
+ bool IsSigned, Value *Mask) {
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ auto *EltTy = SrcTy->getElementType();
+ return getReductionIntrinsic(
+ IsSigned ? Intrinsic::vp_reduce_smin : Intrinsic::vp_reduce_umin,
+ IsSigned ? ConstantInt::get(EltTy, APInt::getSignedMaxValue(
+ EltTy->getIntegerBitWidth()))
+ : Constant::getAllOnesValue(EltTy),
+ Src, Mask, EVL);
+}
+
CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_fmax, Src);
}
+CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, Value *EVL,
+ Value *Mask) {
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ auto *EltTy = SrcTy->getElementType();
+ FastMathFlags FMF = getFastMathFlags();
+ Value *Neutral;
+ if (FMF.noNaNs())
+ Neutral = FMF.noInfs()
+ ? ConstantFP::get(
+ EltTy, APFloat::getLargest(EltTy->getFltSemantics(),
+ /*Negative=*/true))
+ : ConstantFP::getInfinity(EltTy, true);
+ else
+ Neutral = ConstantFP::getQNaN(EltTy, /*Negative=*/true);
+
+ return getReductionIntrinsic(Intrinsic::vp_reduce_fmax, Neutral, Src, Mask,
+ EVL);
+}
+
CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_fmin, Src);
}
+CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, Value *EVL,
+ Value *Mask) {
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ auto *EltTy = SrcTy->getElementType();
+ FastMathFlags FMF = getFastMathFlags();
+ Value *Neutral;
+ if (FMF.noNaNs())
+ Neutral = FMF.noInfs()
+ ? ConstantFP::get(
+ EltTy, APFloat::getLargest(EltTy->getFltSemantics(),
+ /*Negative=*/false))
+ : ConstantFP::getInfinity(EltTy, false);
+ else
+ Neutral = ConstantFP::getQNaN(EltTy, /*Negative=*/false);
+
+ return getReductionIntrinsic(Intrinsic::vp_reduce_fmin, Neutral, Src, Mask,
+ EVL);
+}
+
CallInst *IRBuilderBase::CreateFPMaximumReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_fmaximum, Src);
}
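The floating-point min/max overloads above pick the start (neutral)
value from the builder's fast-math flags: with nnan+ninf it is the
largest finite value of the opposite sign, with nnan alone it is the
opposite-signed infinity, and otherwise a quiet NaN, which any non-NaN
lane wins against under the maxnum/minnum semantics of
llvm.vp.reduce.fmax/fmin. A caller-side sketch (assumed names):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *reduceFMaxNoNaNs(IRBuilderBase &B, Value *Vec, Value *EVL) {
  FastMathFlags FMF;
  FMF.setNoNaNs();
  B.setFastMathFlags(FMF); // nnan without ninf: start value becomes -inf
  return B.CreateFPMaxReduce(Vec, EVL);
}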
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 73c5d636782294..d0abcdfb1440ab 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1204,6 +1204,48 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src,
}
}
+Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src,
+ RecurKind RdxKind, Value *EVL,
+ Value *Mask) {
+ auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType();
+ switch (RdxKind) {
+ case RecurKind::Add:
+ return Builder.CreateAddReduce(Src, EVL, Mask);
+ case RecurKind::Mul:
+ return Builder.CreateMulReduce(Src, EVL, Mask);
+ case RecurKind::And:
+ return Builder.CreateAndReduce(Src, EVL, Mask);
+ case RecurKind::Or:
+ return Builder.CreateOrReduce(Src, EVL, Mask);
+ case RecurKind::Xor:
+ return Builder.CreateXorReduce(Src, EVL, Mask);
+ case RecurKind::FMulAdd:
+ case RecurKind::FAdd:
+ return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy),
+ Src, EVL, Mask);
+ case RecurKind::FMul:
+ return Builder.CreateFMulReduce(ConstantFP::get(SrcVecEltTy, 1.0), Src, EVL,
+ Mask);
+ case RecurKind::SMax:
+ return Builder.CreateIntMaxReduce(Src, EVL, true, Mask);
+ case RecurKind::SMin:
+ return Builder.CreateIntMinReduce(Src, EVL, true, Mask);
+ case RecurKind::UMax:
+ return Builder.CreateIntMaxReduce(Src, EVL, false, Mask);
+ case RecurKind::UMin:
+ return Builder.CreateIntMinReduce(Src, EVL, false, Mask);
+ case RecurKind::FMax:
+ return Builder.CreateFPMaxReduce(Src, EVL, Mask);
+ case RecurKind::FMin:
+ return Builder.CreateFPMinReduce(Src, EVL, Mask);
+ case RecurKind::FMinimum:
+ case RecurKind::FMaximum:
+ assert(0 && "FMaximum/FMinimum reduction VP intrinsic is not supported.");
+ default:
+ llvm_unreachable("Unhandled opcode");
+ }
+}
+
Value *llvm::createTargetReduction(IRBuilderBase &B,
const RecurrenceDescriptor &Desc, Value *Src,
PHINode *OrigPhi) {
@@ -1232,6 +1274,20 @@ Value *llvm::createOrderedReduction(IRBuilderBase &B,
return B.CreateFAddReduce(Start, Src);
}
+Value *llvm::createOrderedReduction(IRBuilderBase &B,
+ const RecurrenceDescriptor &Desc,
+ Value *Src, Value *Start, Value *EVL,
+ Value *Mask) {
+ assert((Desc.getRecurrenceKind() == RecurKind::FAdd ||
+ Desc.getRecurrenceKind() == RecurKind::FMulAdd) &&
+ "Unexpected reduction kind");
+ assert(Src->getType()->isVectorTy() && "Expected a vector type");
+ assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
+ assert(EVL->getType()->isIntegerTy() && "Expected an integer type");
+
+ return B.CreateFAddReduce(Start, Src, EVL, Mask);
+}
+
void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue,
bool IncludeWrapFlags) {
auto *VecOp = dyn_cast<Instruction>(I);
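For the ordered path, a sketch of the intended call shape (Desc is the
FAdd recurrence descriptor the vectorizer already computes; names are
illustrative):

#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;

// Without the reassoc fast-math flag, llvm.vp.reduce.fadd reduces the
// active lanes in order, preserving the scalar loop's rounding.
static Value *orderedFAdd(IRBuilderBase &B, const RecurrenceDescriptor &Desc,
                          Value *Vec, Value *Start, Value *EVL) {
  return createOrderedReduction(B, Desc, Vec, Start, EVL);
}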
From d122adaca63b25079913ed3812ba3c3c31f60999 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 24 Apr 2024 23:20:07 -0700
Subject: [PATCH 02/16] Enable lit test case
---
.../LoopVectorize/RISCV/inloop-reduction.ll | 196 ++++++++++++------
1 file changed, 130 insertions(+), 66 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index b876e9d2c1a5cd..3e93f8639168a0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize < %s -S -o - | FileCheck %s -check-prefix=OUTLOOP
; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize -prefer-inloop-reductions < %s -S -o - | FileCheck %s -check-prefix=INLOOP
-; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL-OUTLOOP %s
+; RUN: opt -passes=loop-vectorize -prefer-inloop-reductions -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL-INLOOP %s
+
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"
-; FIXME: inloop reductions are not supported yet with predicated vectorization.
-
define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; OUTLOOP-LABEL: @add_i16_i32(
; OUTLOOP-NEXT: entry:
@@ -117,69 +117,133 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; INLOOP-NEXT: ret i32 [[R_0_LCSSA]]
;
-; IF-EVL-LABEL: @add_i16_i32(
-; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; IF-EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; IF-EVL: for.body.preheader:
-; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
-; IF-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
-; IF-EVL-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP4]]
-; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
-; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
-; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 4
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; IF-EVL: vector.body:
-; IF-EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[INDEX]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; IF-EVL-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
-; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP12]], i32 2, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i16> poison)
-; IF-EVL-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
-; IF-EVL-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP13]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]]
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; IF-EVL: middle.block:
-; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP15]])
-; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL: for.body:
-; IF-EVL-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; IF-EVL-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; IF-EVL-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32
-; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
-; IF-EVL-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
-; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; IF-EVL: for.cond.cleanup.loopexit:
-; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; IF-EVL: for.cond.cleanup:
-; IF-EVL-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; IF-EVL-NEXT: ret i32 [[R_0_LCSSA]]
+; IF-EVL-OUTLOOP-LABEL: @add_i16_i32(
+; IF-EVL-OUTLOOP-NEXT: entry:
+; IF-EVL-OUTLOOP-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; IF-EVL-OUTLOOP: for.body.preheader:
+; IF-EVL-OUTLOOP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-OUTLOOP: vector.ph:
+; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1
+; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP4]]
+; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
+; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 4
+; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: vector.body:
+; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[INDEX]], i64 0
+; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
+; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
+; IF-EVL-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP12]], i32 2, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i16> poison)
+; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
+; IF-EVL-OUTLOOP-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP13]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]]
+; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-OUTLOOP: middle.block:
+; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP15]])
+; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL-OUTLOOP: scalar.ph:
+; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: for.body:
+; IF-EVL-OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; IF-EVL-OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
+; IF-EVL-OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
+; IF-EVL-OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL-OUTLOOP: for.cond.cleanup.loopexit:
+; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
+; IF-EVL-OUTLOOP: for.cond.cleanup:
+; IF-EVL-OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; IF-EVL-OUTLOOP-NEXT: ret i32 [[R_0_LCSSA]]
+;
+; IF-EVL-INLOOP-LABEL: @add_i16_i32(
+; IF-EVL-INLOOP-NEXT: entry:
+; IF-EVL-INLOOP-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; IF-EVL-INLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; IF-EVL-INLOOP: for.body.preheader:
+; IF-EVL-INLOOP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-INLOOP: vector.ph:
+; IF-EVL-INLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 8
+; IF-EVL-INLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-INLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 8
+; IF-EVL-INLOOP-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1
+; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP4]]
+; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-INLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
+; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 8
+; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-INLOOP: vector.body:
+; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[INDEX]], i64 0
+; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
+; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = add <vscale x 8 x i32> zeroinitializer, [[TMP8]]
+; IF-EVL-INLOOP-NEXT: [[VEC_IV:%.*]] = add <vscale x 8 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
+; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 8 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
+; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
+; IF-EVL-INLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP12]], i32 2, <vscale x 8 x i1> [[TMP10]], <vscale x 8 x i16> poison)
+; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i16> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i32>
+; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = select <vscale x 8 x i1> [[TMP10]], <vscale x 8 x i32> [[TMP13]], <vscale x 8 x i32> zeroinitializer
+; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP14]])
+; IF-EVL-INLOOP-NEXT: [[TMP16]] = add i32 [[TMP15]], [[VEC_PHI]]
+; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
+; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-INLOOP: middle.block:
+; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL-INLOOP: scalar.ph:
+; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL-INLOOP: for.body:
+; IF-EVL-INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; IF-EVL-INLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
+; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
+; IF-EVL-INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
+; IF-EVL-INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL-INLOOP: for.cond.cleanup.loopexit:
+; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
+; IF-EVL-INLOOP: for.cond.cleanup:
+; IF-EVL-INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; IF-EVL-INLOOP-NEXT: ret i32 [[R_0_LCSSA]]
;
entry:
%cmp6 = icmp sgt i32 %n, 0
From 199be198dc806dc481ce2c1e183327f44cba3e02 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 24 Apr 2024 23:21:51 -0700
Subject: [PATCH 03/16] Remove the constraint for reductions
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33c4decd58a6c2..29bb6b0f966603 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1534,9 +1534,7 @@ class LoopVectorizationCostModel {
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
!EnableVPlanNativePath &&
// FIXME: implement support for max safe dependency distance.
- Legal->isSafeForAnyVectorWidth() &&
- // FIXME: remove this once reductions are supported.
- Legal->getReductionVars().empty();
+ Legal->isSafeForAnyVectorWidth();
if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
From ba63732b2073807959456fd4783590810d0c88ea Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 24 Apr 2024 23:28:06 -0700
Subject: [PATCH 04/16] Update test case
---
.../LoopVectorize/RISCV/inloop-reduction.ll | 88 ++++++++++---------
1 file changed, 48 insertions(+), 40 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 3e93f8639168a0..8e06488cfa5e07 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -140,42 +140,46 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL-OUTLOOP: vector.body:
; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
-; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[INDEX]], i64 0
+; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP7]], i32 4, i1 true)
+; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = add i32 [[EVL_BASED_IV]], 0
+; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EVL_BASED_IV]], i64 0
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
-; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
-; IF-EVL-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP12]], i32 2, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i16> poison)
-; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
-; IF-EVL-OUTLOOP-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP13]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP10]]
+; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP11]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = icmp ule <vscale x 4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP9]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i32 0
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.vp.load.nxv4i16.p0(ptr align 2 [[TMP14]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i16> [[VP_OP_LOAD]] to <vscale x 4 x i32>
+; IF-EVL-OUTLOOP-NEXT: [[TMP16]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP15]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> [[TMP16]], <vscale x 4 x i32> [[VEC_PHI]]
+; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP8]], [[EVL_BASED_IV]]
; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL-OUTLOOP: middle.block:
-; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP15]])
+; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP17]])
; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL-OUTLOOP: scalar.ph:
; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL-OUTLOOP: for.body:
; IF-EVL-OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; IF-EVL-OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP20]] to i32
; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
; IF-EVL-OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; IF-EVL-OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL-OUTLOOP: for.cond.cleanup.loopexit:
-; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
; IF-EVL-OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
; IF-EVL-OUTLOOP: for.cond.cleanup:
; IF-EVL-OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -204,42 +208,46 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL-INLOOP: vector.body:
; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
-; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[INDEX]], i64 0
+; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP7]], i32 8, i1 true)
+; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = add i32 [[EVL_BASED_IV]], 0
+; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[EVL_BASED_IV]], i64 0
; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
-; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = add <vscale x 8 x i32> zeroinitializer, [[TMP8]]
-; IF-EVL-INLOOP-NEXT: [[VEC_IV:%.*]] = add <vscale x 8 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
-; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 8 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
-; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
-; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
-; IF-EVL-INLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP12]], i32 2, <vscale x 8 x i1> [[TMP10]], <vscale x 8 x i16> poison)
-; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i16> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i32>
-; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = select <vscale x 8 x i1> [[TMP10]], <vscale x 8 x i32> [[TMP13]], <vscale x 8 x i32> zeroinitializer
-; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP14]])
-; IF-EVL-INLOOP-NEXT: [[TMP16]] = add i32 [[TMP15]], [[VEC_PHI]]
+; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
+; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = add <vscale x 8 x i32> zeroinitializer, [[TMP10]]
+; IF-EVL-INLOOP-NEXT: [[VEC_IV:%.*]] = add <vscale x 8 x i32> [[BROADCAST_SPLAT]], [[TMP11]]
+; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = icmp ule <vscale x 8 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP9]]
+; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i32 0
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP14]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i16> [[VP_OP_LOAD]] to <vscale x 8 x i32>
+; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = select <vscale x 8 x i1> [[TMP12]], <vscale x 8 x i32> [[TMP15]], <vscale x 8 x i32> zeroinitializer
+; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP16]])
+; IF-EVL-INLOOP-NEXT: [[TMP18]] = add i32 [[TMP17]], [[VEC_PHI]]
+; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP8]], [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
-; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-INLOOP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL-INLOOP: middle.block:
; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL-INLOOP: scalar.ph:
; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL-INLOOP: for.body:
; IF-EVL-INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; IF-EVL-INLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
-; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP20]] to i32
; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
; IF-EVL-INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; IF-EVL-INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL-INLOOP: for.cond.cleanup.loopexit:
-; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
; IF-EVL-INLOOP: for.cond.cleanup:
; IF-EVL-INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
From cdd49651ed7eadb609fb1b8ac1b7e93644412585 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 25 Apr 2024 01:08:32 -0700
Subject: [PATCH 05/16] Initial implementation, modeled after
 VPWidenLoadEVLRecipe
---
llvm/lib/Transforms/Vectorize/VPlan.h | 76 +++++++++++++++++++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 53 +++++++++++++
.../Transforms/Vectorize/VPlanTransforms.cpp | 42 +++++-----
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
.../LoopVectorize/RISCV/inloop-reduction.ll | 19 +++--
5 files changed, 162 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c74329a0bcc4ac..863db4bc671433 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -843,6 +843,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPDerivedIVSC:
case VPRecipeBase::VPExpandSCEVSC:
case VPRecipeBase::VPInstructionSC:
+ case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
case VPRecipeBase::VPReplicateSC:
case VPRecipeBase::VPScalarIVStepsSC:
@@ -2110,6 +2111,12 @@ class VPReductionRecipe : public VPSingleDefRecipe {
VPSlotTracker &SlotTracker) const override;
#endif
+ /// Return the recurrence descriptor for the in-loop reduction.
+ const RecurrenceDescriptor &getRecurrenceDescriptor() const {
+ return RdxDesc;
+ }
+ /// Return true if the in-loop reduction is ordered.
+ bool isOrdered() const { return IsOrdered; }
/// The VPValue of the scalar Chain being accumulated.
VPValue *getChainOp() const { return getOperand(0); }
/// The VPValue of the vector value to be reduced.
@@ -2120,6 +2127,75 @@ class VPReductionRecipe : public VPSingleDefRecipe {
}
};
+/// A recipe to represent in-loop reduction operations with vector-predication
+/// intrinsics, performing a reduction on a vector operand with the explicit
+/// vector length (EVL) into a scalar value, and adding the result to a chain.
+/// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
+class VPReductionEVLRecipe : public VPSingleDefRecipe {
+ /// The recurrence descriptor for the reduction in question.
+ const RecurrenceDescriptor &RdxDesc;
+ bool IsOrdered;
+
+ VPReductionEVLRecipe(const RecurrenceDescriptor &R, Instruction *I,
+ VPValue *ChainOp, VPValue *VecOp, VPValue *EVL,
+ VPValue *CondOp, bool IsOrdered)
+ : VPSingleDefRecipe(VPDef::VPReductionEVLSC,
+ ArrayRef<VPValue *>({ChainOp, VecOp, EVL}), I),
+ RdxDesc(R), IsOrdered(IsOrdered) {
+ if (CondOp)
+ addOperand(CondOp);
+ }
+
+public:
+ VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL)
+ : VPSingleDefRecipe(
+ VPDef::VPReductionEVLSC,
+ ArrayRef<VPValue *>({R->getChainOp(), R->getVecOp(), EVL}),
+ R->getUnderlyingInstr()),
+ RdxDesc(R->getRecurrenceDescriptor()), IsOrdered(R->isOrdered()) {
+ VPValue *CondOp = R->getCondOp();
+ if (CondOp)
+ addOperand(CondOp);
+ }
+
+ ~VPReductionEVLRecipe() override = default;
+
+ VPReductionEVLRecipe *clone() override {
+ return new VPReductionEVLRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
+ getVecOp(), getEVL(), getCondOp(),
+ IsOrdered);
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPReductionEVLSC)
+
+ /// Generate the reduction in the loop.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// The VPValue of the scalar Chain being accumulated.
+ VPValue *getChainOp() const { return getOperand(0); }
+ /// The VPValue of the vector value to be reduced.
+ VPValue *getVecOp() const { return getOperand(1); }
+ /// The VPValue of the explicit vector length.
+ VPValue *getEVL() const { return getOperand(2); }
+ /// The VPValue of the condition for the block.
+ VPValue *getCondOp() const {
+ return getNumOperands() > 3 ? getOperand(3) : nullptr;
+ }
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getEVL();
+ }
+};
+
/// VPReplicateRecipe replicates a given instruction producing multiple scalar
/// copies of the original scalar type, one per lane, instead of producing a
/// single copy of widened type for all lanes. If the instruction is known to be
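A construction sketch for the new recipe (assumed to run inside a VPlan
transform, with VPlan.h available as a local header):

#include "VPlan.h"
using namespace llvm;

// Rewires an existing in-loop reduction recipe into its EVL form: the
// chain, vector operand, and any condition operand carry over, and the
// explicit-vector-length VPValue is appended as an extra operand.
static VPReductionEVLRecipe *toEVLForm(VPReductionRecipe *R, VPValue *EVL) {
  return new VPReductionEVLRecipe(R, EVL);
}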
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c82..6c370550114950 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1577,6 +1577,36 @@ void VPReductionRecipe::execute(VPTransformState &State) {
}
}
+void VPReductionEVLRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Reduction being replicated.");
+ assert(State.UF == 1 &&
+ "Expected only UF == 1 when vectorizing with explicit vector length.");
+
+ auto &Builder = State.Builder;
+ // Propagate the fast-math flags carried by the underlying instruction.
+ IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
+ Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+
+ RecurKind Kind = RdxDesc.getRecurrenceKind();
+ Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true);
+ Value *VecOp = State.get(getVecOp(), 0);
+ Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+ Value *Cond = getCondOp() ? State.get(getCondOp(), 0) : nullptr;
+
+ Value *NewRed;
+ if (IsOrdered) {
+ NewRed = createOrderedReduction(Builder, RdxDesc, VecOp, Prev, EVL, Cond);
+ } else {
+ NewRed = createSimpleTargetReduction(Builder, VecOp, Kind, EVL, Cond);
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
+ NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
+ else
+ NewRed = Builder.CreateBinOp(
+ (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
+ }
+ State.set(this, NewRed, 0, /*IsScalar*/ true);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
@@ -1598,6 +1628,29 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
O << " (with final reduction value stored in invariant address sank "
"outside of loop)";
}
+
+void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "REDUCE ";
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ getChainOp()->printAsOperand(O, SlotTracker);
+ O << " +";
+ if (isa<FPMathOperator>(getUnderlyingInstr()))
+ O << getUnderlyingInstr()->getFastMathFlags();
+ O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
+ getVecOp()->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getEVL()->printAsOperand(O, SlotTracker);
+ if (getCondOp()) {
+ O << ", ";
+ getCondOp()->printAsOperand(O, SlotTracker);
+ }
+ O << ")";
+ if (RdxDesc.IntermediateStore)
+ O << " (with final reduction value stored in invariant address sank "
+ "outside of loop)";
+}
#endif
bool VPReplicateRecipe::shouldPack() const {
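For reference, the printer above renders the recipe along these lines
(an illustrative example derived from the code, not a captured dump):

REDUCE vp<%red.next> = vp<%red> + reduce.add (vp<%vec>, vp<%evl>, vp<%mask>)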
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d7bc128dcfe634..6a8912cdd68160 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1342,25 +1342,29 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
- auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U);
- if (!MemR)
- continue;
- assert(!MemR->isReverse() &&
- "Reversed memory operations not supported yet.");
- VPValue *OrigMask = MemR->getMask();
- assert(OrigMask && "Unmasked widen memory recipe when folding tail");
- VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask;
- if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) {
- auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
- N->insertBefore(L);
- L->replaceAllUsesWith(N);
- L->eraseFromParent();
- } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) {
- auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
- N->insertBefore(S);
- S->eraseFromParent();
- } else {
- llvm_unreachable("unsupported recipe");
+ if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+ assert(!MemR->isReverse() &&
+ "Reversed memory operations not supported yet.");
+ VPValue *OrigMask = MemR->getMask();
+ assert(OrigMask && "Unmasked widen memory recipe when folding tail");
+ VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask;
+ if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) {
+ auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
+ N->insertBefore(L);
+ L->replaceAllUsesWith(N);
+ L->eraseFromParent();
+ } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) {
+ auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
+ N->insertBefore(S);
+ S->eraseFromParent();
+ } else {
+ llvm_unreachable("unsupported recipe");
+ }
+ } else if (auto *RedR = dyn_cast<VPReductionRecipe>(U)) {
+ auto *N = new VPReductionEVLRecipe(RedR, VPEVL);
+ N->insertBefore(RedR);
+ RedR->replaceAllUsesWith(N);
+ RedR->eraseFromParent();
}
}
recursivelyDeleteDeadRecipes(HeaderMask);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 96d04271850f70..ae72fd9f45dc83 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -347,6 +347,7 @@ class VPDef {
VPExpandSCEVSC,
VPInstructionSC,
VPInterleaveSC,
+ VPReductionEVLSC,
VPReductionSC,
VPReplicateSC,
VPScalarCastSC,
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 8e06488cfa5e07..0d7e95ac5db91d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -209,7 +209,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-INLOOP: vector.body:
; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP7]], i32 8, i1 true)
; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = add i32 [[EVL_BASED_IV]], 0
@@ -223,31 +223,30 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i32 0
; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP14]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 [[TMP8]])
; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i16> [[VP_OP_LOAD]] to <vscale x 8 x i32>
-; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = select <vscale x 8 x i1> [[TMP12]], <vscale x 8 x i32> [[TMP15]], <vscale x 8 x i32> zeroinitializer
-; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP16]])
-; IF-EVL-INLOOP-NEXT: [[TMP18]] = add i32 [[TMP17]], [[VEC_PHI]]
+; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, <vscale x 8 x i32> [[TMP15]], <vscale x 8 x i1> [[TMP12]], i32 [[TMP8]])
+; IF-EVL-INLOOP-NEXT: [[TMP17]] = add i32 [[TMP16]], [[VEC_PHI]]
; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP8]], [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
-; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-INLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL-INLOOP: middle.block:
; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL-INLOOP: scalar.ph:
; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL-INLOOP: for.body:
; IF-EVL-INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; IF-EVL-INLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
-; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP20]] to i32
+; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
; IF-EVL-INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; IF-EVL-INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL-INLOOP: for.cond.cleanup.loopexit:
-; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
; IF-EVL-INLOOP: for.cond.cleanup:
; IF-EVL-INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
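The middle hunk above is the functional change: the predicated in-loop add used to materialize the additive identity with a select on the header mask and then run a plain vector.reduce.add, and it now becomes a single vp.reduce.add that takes the mask and the active vector length directly. Below is a small stand-alone C++ reference model of why those two forms agree for integer add, assuming the LangRef semantics that a vp.reduce lane participates only when it is both below EVL and enabled in the mask.

// Reference model (illustration only):
//   select(mask, v, 0) + vector.reduce.add   vs.   vp.reduce.add(0, v, mask, evl)
#include <cassert>
#include <cstdint>
#include <vector>

// llvm.vp.reduce.add semantics: fold the start value together with every
// lane that is below EVL and set in the mask.
int32_t vpReduceAdd(int32_t Start, const std::vector<int32_t> &V,
                    const std::vector<bool> &Mask, unsigned EVL) {
  int32_t R = Start;
  for (unsigned I = 0; I < EVL; ++I)
    if (Mask[I])
      R += V[I];
  return R;
}

// The pre-EVL lowering: write the additive identity into masked-off lanes,
// then reduce all lanes unconditionally.
int32_t selectThenReduce(const std::vector<int32_t> &V,
                         const std::vector<bool> &Mask) {
  int32_t R = 0;
  for (unsigned I = 0; I < V.size(); ++I)
    R += Mask[I] ? V[I] : 0;
  return R;
}

int main() {
  std::vector<int32_t> V = {3, -7, 42, 5};
  // Header mask with the final lane off; EVL covers all four lanes here.
  std::vector<bool> Mask = {true, true, true, false};
  assert(vpReduceAdd(0, V, Mask, 4) == selectThenReduce(V, Mask));
  return 0;
}

The same argument extends to the other reduction kinds exercised later in this series, with 0 replaced by the operation's neutral element (-1 for and, INT32_MAX for smin, and so on).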
>From 7ae27f2bf3153b18884534a814472958f689b5b9 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 25 Apr 2024 02:27:27 -0700
Subject: [PATCH 06/16] Add test case for ordered reduction
---
...e-force-tail-with-evl-ordered-reduction.ll | 112 ++++++++++++++++++
1 file changed, 112 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll
new file mode 100644
index 00000000000000..21fa09594fa367
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-ordered-reductions=true -hints-allow-reordering=false \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-ordered-reductions=true -hints-allow-reordering=false \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s | FileCheck %s --check-prefix=NO-VP
+
+define float @fadd(ptr noalias nocapture readonly %a, i64 %n) {
+; IF-EVL-LABEL: @fadd(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19]] = call float @llvm.vp.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ADD]] = fadd float [[TMP22]], [[SUM_07]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret float [[ADD_LCSSA]]
+;
+; NO-VP-LABEL: @fadd(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret float [[ADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %add = fadd float %0, %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %add
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
>From 18cf98f36e46bc8e50094d6f23d564a42c387b50 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 25 Apr 2024 06:52:52 -0700
Subject: [PATCH 07/16] Add test case for all reduction kinds
---
...vectorize-force-tail-with-evl-reduction.ll | 1429 +++++++++++++++++
1 file changed, 1429 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
new file mode 100644
index 00000000000000..0c64a438397515
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
@@ -0,0 +1,1429 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-inloop-reductions \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-inloop-reductions \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s | FileCheck %s --check-prefix=NO-VP
+
+define i32 @add(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @add(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP20]] = add i32 [[TMP19]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[TMP23]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; NO-VP-LABEL: @add(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ADD]] = add nsw i32 [[TMP0]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret i32 [[ADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %add = add nsw i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %add
+}
+
+; Mul reduction is not supported for scalable vectors, so this loop is not vectorized.
+define i32 @mul(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @mul(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MUL]] = mul nsw i32 [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[MUL_LCSSA]]
+;
+; NO-VP-LABEL: @mul(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[MUL]] = mul nsw i32 [[TMP0]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret i32 [[MUL_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %mul, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %mul = mul nsw i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %mul
+}
+
+define i32 @or(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @or(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vp.reduce.or.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP20]] = or i32 [[TMP19]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[OR]] = or i32 [[TMP23]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[OR_LCSSA]]
+;
+; NO-VP-LABEL: @or(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[OR:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[OR]] = or i32 [[TMP0]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret i32 [[OR_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %or, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %or = or i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %or
+}
+
+define i32 @and(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @and(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vp.reduce.and.nxv4i32(i32 -1, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP20]] = and i32 [[TMP19]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[AND]] = and i32 [[TMP23]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[AND_LCSSA]]
+;
+; NO-VP-LABEL: @and(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[AND:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[AND]] = and i32 [[TMP0]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret i32 [[AND_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %and, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %and = and i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %and
+}
+
+define i32 @xor(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @xor(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vp.reduce.xor.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP20]] = xor i32 [[TMP19]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[XOR]] = xor i32 [[TMP23]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[XOR_LCSSA]]
+;
+; NO-VP-LABEL: @xor(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[XOR:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[XOR]] = xor i32 [[TMP0]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret i32 [[XOR_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %xor, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %xor = xor i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %xor
+}
+
+define i32 @smin(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @smin(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vp.reduce.smin.nxv4i32(i32 2147483647, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP19]], i32 [[VEC_PHI]])
+; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP22]], [[RDX]]
+; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP22]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[SMIN_LCSSA]]
+;
+; NO-VP-LABEL: @smin(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP0]], [[RDX]]
+; NO-VP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret i32 [[SMIN_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %smin, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp slt i32 %0, %rdx
+ %smin = select i1 %cmp.i, i32 %0, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %smin
+}
+
+define i32 @smax(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @smax(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -2147483648, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smax.i32(i32 [[TMP19]], i32 [[VEC_PHI]])
+; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP22]], [[RDX]]
+; IF-EVL-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP22]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[SMAX_LCSSA]]
+;
+; NO-VP-LABEL: @smax(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP0]], [[RDX]]
+; NO-VP-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret i32 [[SMAX_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %smax, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp sgt i32 %0, %rdx
+ %smax = select i1 %cmp.i, i32 %0, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %smax
+}
+
+define i32 @umin(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @umin(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vp.reduce.umin.nxv4i32(i32 -1, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umin.i32(i32 [[TMP19]], i32 [[VEC_PHI]])
+; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP22]], [[RDX]]
+; IF-EVL-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP22]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[UMIN_LCSSA]]
+;
+; NO-VP-LABEL: @umin(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP0]], [[RDX]]
+; NO-VP-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret i32 [[UMIN_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %umin, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp ult i32 %0, %rdx
+ %umin = select i1 %cmp.i, i32 %0, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %umin
+}
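
The IF-EVL checks above all share one strip-mined shape: each vector
iteration asks @llvm.experimental.get.vector.length how many lanes of the
remaining trip count to process, loads and reduces exactly that many lanes
with vp intrinsics, and folds the partial result into a scalar accumulator
before advancing the EVL-based IV. A minimal C++ model of that control flow
for the umin case (vlmax() and vp_reduce_umin() are hypothetical stand-ins
for the hardware VLMAX and @llvm.vp.reduce.umin, not real APIs):

#include <algorithm>
#include <cstdint>

// Stand-in for the hardware maximum vector length.
static int64_t vlmax() { return 8; }

// Stand-in for @llvm.vp.reduce.umin over the first EVL lanes at P.
static uint32_t vp_reduce_umin(const uint32_t *P, int64_t EVL) {
  uint32_t M = UINT32_MAX; // umin identity, matching the i32 -1 start value
  for (int64_t I = 0; I < EVL; ++I)
    M = std::min(M, P[I]);
  return M;
}

uint32_t umin_evl(const uint32_t *A, int64_t N, uint32_t Start) {
  uint32_t Rdx = Start;
  for (int64_t IV = 0; IV < N;) {
    int64_t EVL = std::min(N - IV, vlmax());          // get.vector.length
    Rdx = std::min(Rdx, vp_reduce_umin(A + IV, EVL)); // in-loop reduction
    IV += EVL; // the EVL-based IV advances by the lanes actually processed
  }
  return Rdx;
}

The NO-VP checks, by contrast, expect the loop to stay scalar.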
+
+define i32 @umax(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @umax(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vp.reduce.umax.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umax.i32(i32 [[TMP19]], i32 [[VEC_PHI]])
+; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP22]], [[RDX]]
+; IF-EVL-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP22]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[UMAX_LCSSA]]
+;
+; NO-VP-LABEL: @umax(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP0]], [[RDX]]
+; NO-VP-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret i32 [[UMAX_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %umax, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp ugt i32 %0, %rdx
+ %umax = select i1 %cmp.i, i32 %0, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %umax
+}
+
+define float @fadd(ptr %a, i64 %n, float %start) {
+; IF-EVL-LABEL: @fadd(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call reassoc float @llvm.vp.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP20]] = fadd reassoc float [[TMP19]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ADD]] = fadd reassoc float [[TMP23]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret float [[ADD_LCSSA]]
+;
+; NO-VP-LABEL: @fadd(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ADD]] = fadd reassoc float [[TMP0]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret float [[ADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %add = fadd reassoc float %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %add
+}
+
+; fmul reduction is not supported for scalable vectors.
+define float @fmul(ptr %a, i64 %n, float %start) {
+; IF-EVL-LABEL: @fmul(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret float [[MUL_LCSSA]]
+;
+; NO-VP-LABEL: @fmul(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[MUL]] = fmul reassoc float [[TMP0]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret float [[MUL_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %mul, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %mul = fmul reassoc float %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %mul
+}
+
+define float @fmin(ptr %a, i64 %n, float %start) #0 {
+; IF-EVL-LABEL: @fmin(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call fast float @llvm.vp.reduce.fmin.nxv4f32(float 0x47EFFFFFE0000000, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP19]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP19]], float [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP22]], [[RDX]]
+; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP22]], float [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret float [[MIN_LCSSA]]
+;
+; NO-VP-LABEL: @fmin(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], [[RDX]]
+; NO-VP-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP0]], float [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret float [[MIN_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %min, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %cmp = fcmp fast olt float %0, %rdx
+ %min = select i1 %cmp, float %0, float %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %min
+}
+
+define float @fmax(ptr %a, i64 %n, float %start) #0 {
+; IF-EVL-LABEL: @fmax(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call fast float @llvm.vp.reduce.fmax.nxv4f32(float 0xC7EFFFFFE0000000, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt float [[TMP19]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP19]], float [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP22]], [[RDX]]
+; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP22]], float [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret float [[MAX_LCSSA]]
+;
+; NO-VP-LABEL: @fmax(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP0]], [[RDX]]
+; NO-VP-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP0]], float [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret float [[MAX_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %max, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %cmp = fcmp fast ogt float %0, %rdx
+ %max = select i1 %cmp, float %0, float %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %max
+}
+
+define float @fminimum(ptr %a, i64 %n, float %start) {
+; IF-EVL-LABEL: @fminimum(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]])
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret float [[MIN_LCSSA]]
+;
+; NO-VP-LABEL: @fminimum(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]])
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret float [[MIN_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %min, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %min = tail call float @llvm.minimum.f32(float %rdx, float %0)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %min
+}
+
+define float @fmaximum(ptr %a, i64 %n, float %start) {
+; IF-EVL-LABEL: @fmaximum(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]])
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret float [[MAX_LCSSA]]
+;
+; NO-VP-LABEL: @fmaximum(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]])
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret float [[MAX_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %max, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %max = tail call float @llvm.maximum.f32(float %rdx, float %0)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %max
+}
+
+define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
+; IF-EVL-LABEL: @fmuladd(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP21:%.*]] = fmul reassoc <vscale x 4 x float> [[VP_OP_LOAD]], [[VP_OP_LOAD3]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = call reassoc float @llvm.vp.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP21]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP23]] = fadd reassoc float [[TMP22]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP24]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP26]], float [[TMP27]], float [[RDX]])
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret float [[MULADD_LCSSA]]
+;
+; NO-VP-LABEL: @fmuladd(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[RDX]])
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret float [[MULADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %muladd, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv
+ %1 = load float, ptr %arrayidx2, align 4
+ %muladd = tail call reassoc float @llvm.fmuladd.f32(float %0, float %1, float %rdx)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %muladd
+}
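
One detail worth noting in the fmuladd case: the IF-EVL body does not use a
fused vp intrinsic. It decomposes @llvm.fmuladd into a reassociated
elementwise fmul feeding @llvm.vp.reduce.fadd, and a scalar fadd completes
the reduction chain. A per-strip sketch of that shape in plain C++
(vp_mul_reduce_fadd() is a hypothetical stand-in for the fmul-plus-reduce
pair, and a VLMAX of 8 is assumed):

#include <algorithm>
#include <cstdint>

// Stand-in for the fmul + @llvm.vp.reduce.fadd pair in the IF-EVL body:
// multiply the first EVL lanes elementwise, then sum the products.
static float vp_mul_reduce_fadd(const float *A, const float *B, int64_t EVL) {
  float S = 0.0f; // the IR uses the -0.0 fadd identity; 0.0 suffices here
  for (int64_t I = 0; I < EVL; ++I)
    S += A[I] * B[I];
  return S;
}

float fmuladd_evl(const float *A, const float *B, int64_t N, float Start) {
  float Rdx = Start;
  for (int64_t IV = 0; IV < N;) {
    int64_t EVL = std::min<int64_t>(N - IV, 8);     // get.vector.length
    Rdx += vp_mul_reduce_fadd(A + IV, B + IV, EVL); // scalar fadd into chain
    IV += EVL;
  }
  return Rdx;
}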
+
+declare float @llvm.minimum.f32(float, float)
+declare float @llvm.maximum.f32(float, float)
+declare float @llvm.fmuladd.f32(float, float, float)
+
+attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
>From aac2a6147101572a6fefaf5b948b515039a0d83f Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 25 Apr 2024 07:21:29 -0700
Subject: [PATCH 08/16] Block FMinimum and FMaximum
---
.../Transforms/Vectorize/LoopVectorize.cpp | 14 ++-
...vectorize-force-tail-with-evl-reduction.ll | 102 ++++++++++++++----
2 files changed, 97 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 29bb6b0f966603..1db531e170a4bf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1526,6 +1526,17 @@ class LoopVectorizationCostModel {
ForceTailFoldingStyle.getValue());
if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
return;
+
+  // Block tail folding with EVL, since the vector-predication reduction
+  // intrinsics do not yet support FMinimum and FMaximum.
+  // FIXME: Remove this check once llvm.vp.reduce.fminimum/fmaximum are
+  // supported.
+ bool ContainsFMinimumOrFMaximumReduction =
+ any_of(Legal->getReductionVars(), [&](auto &Reduction) {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ RecurKind Kind = RdxDesc.getRecurrenceKind();
+ return Kind == RecurKind::FMinimum || Kind == RecurKind::FMaximum;
+ });
// Override forced styles if needed.
// FIXME: use actual opcode/data type for analysis here.
// FIXME: Investigate opportunity for fixed vector factor.
@@ -1534,7 +1545,8 @@ class LoopVectorizationCostModel {
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
!EnableVPlanNativePath &&
// FIXME: implement support for max safe dependency distance.
- Legal->isSafeForAnyVectorWidth();
+ Legal->isSafeForAnyVectorWidth() &&
+ !ContainsFMinimumOrFMaximumReduction;
if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
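
Stripped of the LoopVectorize types, the guard added above is a simple scan
of the recorded reduction descriptors. A standalone sketch with stand-in
types (RecurKind and the map below are illustrations, not the real classes):

#include <algorithm>
#include <map>

enum class RecurKind {
  Add, Mul, FAdd, UMin, UMax, FMin, FMax, FMinimum, FMaximum
};

// Stand-in for Legal->getReductionVars(): reduction phi id -> recurrence kind.
using ReductionVarMap = std::map<int, RecurKind>;

static bool containsFMinimumOrFMaximum(const ReductionVarMap &Reductions) {
  return std::any_of(Reductions.begin(), Reductions.end(), [](const auto &R) {
    return R.second == RecurKind::FMinimum || R.second == RecurKind::FMaximum;
  });
}

When the predicate fires, EVLIsLegal becomes false and the loop falls back
to a masked fixed-width vector body, which is exactly what the updated
fminimum/fmaximum checks below expect.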
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
index 0c64a438397515..0c4a9fae81950e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
@@ -1214,18 +1214,51 @@ for.end:
define float @fminimum(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fminimum(
; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
+; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <8 x float> [[MINMAX_IDENT_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison)
+; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[TMP5]])
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]])
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP8]])
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; IF-EVL: for.end:
-; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MIN_LCSSA]]
;
; NO-VP-LABEL: @fminimum(
@@ -1264,18 +1297,51 @@ for.end:
define float @fmaximum(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fmaximum(
; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
+; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <8 x float> [[MINMAX_IDENT_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison)
+; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[TMP5]])
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]])
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP8]])
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
; IF-EVL: for.end:
-; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MAX_LCSSA]]
;
; NO-VP-LABEL: @fmaximum(
@@ -1360,7 +1426,7 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP24]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
; IF-EVL-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
@@ -1377,7 +1443,7 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP26]], float [[TMP27]], float [[RDX]])
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MULADD_LCSSA]]
>From dc1695a592bd6c286394a5cf6212f3d81c1a2d27 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 26 Apr 2024 01:40:19 -0700
Subject: [PATCH 09/16] Remove clone function
---
llvm/lib/Transforms/Vectorize/VPlan.h | 14 +-------------
1 file changed, 1 insertion(+), 13 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 863db4bc671433..a444064dab692a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2136,16 +2136,6 @@ class VPReductionEVLRecipe : public VPSingleDefRecipe {
const RecurrenceDescriptor &RdxDesc;
bool IsOrdered;
- VPReductionEVLRecipe(const RecurrenceDescriptor &R, Instruction *I,
- VPValue *ChainOp, VPValue *VecOp, VPValue *EVL,
- VPValue *CondOp, bool IsOrdered)
- : VPSingleDefRecipe(VPDef::VPReductionEVLSC,
- ArrayRef<VPValue *>({ChainOp, VecOp, EVL}), I),
- RdxDesc(R), IsOrdered(IsOrdered) {
- if (CondOp)
- addOperand(CondOp);
- }
-
public:
VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL)
: VPSingleDefRecipe(
@@ -2161,9 +2151,7 @@ class VPReductionEVLRecipe : public VPSingleDefRecipe {
~VPReductionEVLRecipe() override = default;
VPReductionEVLRecipe *clone() override {
- return new VPReductionEVLRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
- getVecOp(), getEVL(), getCondOp(),
- IsOrdered);
+ llvm_unreachable("cloning not implemented yet");
}
VP_CLASSOF_IMPL(VPDef::VPReductionEVLSC)
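Replacing the copy path with llvm_unreachable is presumably a simplification rather than a regression: at this point in the pipeline nothing is expected to clone a plan containing EVL recipes (plan cloning is chiefly needed for epilogue vectorization, which is not combined with EVL tail folding), and the hard failure makes any future violation of that assumption immediately visible.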
>From 5a62288b1e3271490c300407c1987d0ba245e673 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 1 May 2024 21:01:21 -0700
Subject: [PATCH 10/16] Add VPlan test
---
.../Transforms/Vectorize/VPlanTest.cpp | 29 +++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 64e9c06db3fe8b..32b2a63d822c23 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1128,6 +1128,20 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
EXPECT_FALSE(Recipe.mayReadOrWriteMemory());
}
+ {
+ VPValue ChainOp;
+ VPValue VecOp;
+ VPValue CondOp;
+ VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp,
+ &VecOp, false);
+ VPValue EVL;
+ VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL);
+ EXPECT_FALSE(EVLRecipe.mayHaveSideEffects());
+ EXPECT_FALSE(EVLRecipe.mayReadFromMemory());
+ EXPECT_FALSE(EVLRecipe.mayWriteToMemory());
+ EXPECT_FALSE(EVLRecipe.mayReadOrWriteMemory());
+ }
+
{
auto *Load =
new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1));
@@ -1463,6 +1477,21 @@ TEST(VPRecipeTest, CastVPReductionRecipeToVPUser) {
EXPECT_TRUE(isa<VPUser>(BaseR));
}
+TEST(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) {
+ LLVMContext C;
+
+ VPValue ChainOp;
+ VPValue VecOp;
+ VPValue CondOp;
+ VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp,
+ &VecOp, false);
+ VPValue EVL;
+ VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL);
+ EXPECT_TRUE(isa<VPUser>(&EVLRecipe));
+ VPRecipeBase *BaseR = &EVLRecipe;
+ EXPECT_TRUE(isa<VPUser>(BaseR));
+}
+
struct VPDoubleValueDef : public VPRecipeBase {
VPDoubleValueDef(ArrayRef<VPValue *> Operands) : VPRecipeBase(99, Operands) {
new VPValue(nullptr, this);
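The new tests build a VPReductionEVLRecipe directly from a VPReductionRecipe plus an EVL operand. For context, a hedged sketch of the rewrite the EVL transform performs elsewhere in this PR (assuming a VPRecipeBase &CurRecipe and a VPValue *EVL already in scope; the actual code lives in VPlanTransforms and may differ in detail):

  auto *Red = cast<VPReductionRecipe>(&CurRecipe);
  auto *EVLRed = new VPReductionEVLRecipe(Red, EVL);
  EVLRed->insertBefore(Red);       // keep the position in the vector loop body
  Red->replaceAllUsesWith(EVLRed); // rewire the reduction chain users
  Red->eraseFromParent();

Passing the original recipe to the constructor lets the EVL recipe inherit the recurrence descriptor, chain/vector/condition operands, and IsOrdered flag without re-deriving them.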
>From 172745025d81388ef8104006ffd7e6faa3d1b04a Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 2 May 2024 06:13:46 -0700
Subject: [PATCH 11/16] Move utils to VectorBuilder
---
llvm/include/llvm/IR/VectorBuilder.h | 15 ++++++
.../include/llvm/Transforms/Utils/LoopUtils.h | 6 +++
llvm/lib/IR/VectorBuilder.cpp | 48 +++++++++++++++++++
llvm/lib/Transforms/Utils/LoopUtils.cpp | 27 +++++++++++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 ++--
...vectorize-force-tail-with-evl-reduction.ll | 4 +-
6 files changed, 105 insertions(+), 5 deletions(-)
diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h
index 301edaed70fe88..10ab6ee4cd1cb0 100644
--- a/llvm/include/llvm/IR/VectorBuilder.h
+++ b/llvm/include/llvm/IR/VectorBuilder.h
@@ -15,6 +15,7 @@
#ifndef LLVM_IR_VECTORBUILDER_H
#define LLVM_IR_VECTORBUILDER_H
+#include "llvm/Analysis/IVDescriptors.h"
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/InstrTypes.h>
#include <llvm/IR/Instruction.h>
@@ -57,6 +58,11 @@ class VectorBuilder {
return RetType();
}
+ // Helper function for creating a VP intrinsic call.
+ Value *createVectorInstructionImpl(Intrinsic::ID VPID, Type *ReturnTy,
+ ArrayRef<Value *> VecOpArray,
+ const Twine &Name = Twine());
+
public:
VectorBuilder(IRBuilderBase &Builder,
Behavior ErrorHandling = Behavior::ReportAndAbort)
@@ -92,6 +98,15 @@ class VectorBuilder {
Value *createVectorInstruction(unsigned Opcode, Type *ReturnTy,
ArrayRef<Value *> VecOpArray,
const Twine &Name = Twine());
+
+ // Emit a VP reduction intrinsic call for recurrence \p Kind.
+ // \p Kind The kind of recurrence
+ // \p ValTy The type of operand which the reduction operation is
+ // performed.
+ // \p VecOpArray The operand list.
+ Value *createSimpleTargetReduction(RecurKind Kind, Type *ValTy,
+ ArrayRef<Value *> VecOpArray,
+ const Twine &Name = Twine());
};
} // namespace llvm
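A minimal usage sketch of the new createSimpleTargetReduction entry point (assuming an IRBuilderBase Builder positioned at the insertion point, and Value *Src, *Mask, *EVL, and a neutral element *Iden in scope; all names here are illustrative, not part of the patch):

  VectorBuilder VBuilder(Builder);
  VBuilder.setMask(Mask);
  VBuilder.setEVL(EVL);
  Value *Ops[] = {Iden, Src};
  Value *Rdx = VBuilder.createSimpleTargetReduction(
      RecurKind::FAdd, Src->getType(), Ops, "rdx");

Callers pass only the ordinary operands; the mask and EVL are appended by createVectorInstructionImpl based on the VP intrinsic's declared parameter positions.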
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 5003fa66100b46..495836c95b5e46 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -15,6 +15,7 @@
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/IR/VectorBuilder.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
namespace llvm {
@@ -406,6 +407,8 @@ Value *createSimpleTargetReduction(IRBuilderBase &B, Value *Src,
Value *createSimpleTargetReduction(IRBuilderBase &B, Value *Src,
RecurKind RdxKind, Value *EVL,
Value *Mask = nullptr);
+Value *createSimpleTargetReduction(VectorBuilder &VB, Value *Src,
+ const RecurrenceDescriptor &Desc);
/// Create a target reduction of the given vector \p Src for a reduction of the
/// kind RecurKind::IAnyOf or RecurKind::FAnyOf. The reduction operation is
@@ -429,6 +432,9 @@ Value *createOrderedReduction(IRBuilderBase &B,
Value *createOrderedReduction(IRBuilderBase &B,
const RecurrenceDescriptor &Desc, Value *Src,
Value *Start, Value *EVL, Value *Mask = nullptr);
+Value *createOrderedReduction(VectorBuilder &VB,
+ const RecurrenceDescriptor &Desc, Value *Src,
+ Value *Start);
/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp
index c07bc0561fba93..3bdedb8ffb2b5f 100644
--- a/llvm/lib/IR/VectorBuilder.cpp
+++ b/llvm/lib/IR/VectorBuilder.cpp
@@ -57,7 +57,55 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy,
auto VPID = VPIntrinsic::getForOpcode(Opcode);
if (VPID == Intrinsic::not_intrinsic)
return returnWithError<Value *>("No VPIntrinsic for this opcode");
+ return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name);
+}
+
+Value *VectorBuilder::createSimpleTargetReduction(RecurKind Kind, Type *ValTy,
+ ArrayRef<Value *> InstOpArray,
+ const Twine &Name) {
+ auto getForRecurKind = [](RecurKind Kind) {
+ switch (Kind) {
+ case RecurKind::Add:
+ return Intrinsic::vp_reduce_add;
+ case RecurKind::Mul:
+ return Intrinsic::vp_reduce_mul;
+ case RecurKind::And:
+ return Intrinsic::vp_reduce_and;
+ case RecurKind::Or:
+ return Intrinsic::vp_reduce_or;
+ case RecurKind::Xor:
+ return Intrinsic::vp_reduce_xor;
+ case RecurKind::FMulAdd:
+ case RecurKind::FAdd:
+ return Intrinsic::vp_reduce_fadd;
+ case RecurKind::FMul:
+ return Intrinsic::vp_reduce_fmul;
+ case RecurKind::SMax:
+ return Intrinsic::vp_reduce_smax;
+ case RecurKind::SMin:
+ return Intrinsic::vp_reduce_smin;
+ case RecurKind::UMax:
+ return Intrinsic::vp_reduce_umax;
+ case RecurKind::UMin:
+ return Intrinsic::vp_reduce_umin;
+ case RecurKind::FMax:
+ return Intrinsic::vp_reduce_fmax;
+ case RecurKind::FMin:
+ return Intrinsic::vp_reduce_fmin;
+ default:
+ return Intrinsic::not_intrinsic;
+ }
+ };
+ auto VPID = getForRecurKind(Kind);
+ if (VPID == Intrinsic::not_intrinsic)
+ return returnWithError<Value *>("No VPIntrinsic for this reduction");
+ return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name);
+}
+Value *VectorBuilder::createVectorInstructionImpl(Intrinsic::ID VPID,
+ Type *ReturnTy,
+ ArrayRef<Value *> InstOpArray,
+ const Twine &Name) {
auto MaskPosOpt = VPIntrinsic::getMaskParamPos(VPID);
auto VLenPosOpt = VPIntrinsic::getVectorLengthParamPos(VPID);
size_t NumInstParams = InstOpArray.size();
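One subtlety in the mapping above: RecurKind::FMulAdd deliberately shares Intrinsic::vp_reduce_fadd with RecurKind::FAdd. For an fmuladd reduction the multiply is widened as a regular vector operation, and only the accumulating add participates in the reduction, so both kinds lower to the same VP intrinsic.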
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index d0abcdfb1440ab..c9e3727a5a3256 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1246,6 +1246,19 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src,
}
}
+Value *llvm::createSimpleTargetReduction(VectorBuilder &VBuilder, Value *Src,
+ const RecurrenceDescriptor &Desc) {
+ RecurKind Kind = Desc.getRecurrenceKind();
+ assert(Kind != RecurKind::FMinimum && Kind != RecurKind::FMaximum &&
+ "FMaximum/FMinimum reduction VP intrinsic is not supported.");
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ Type *SrcEltTy = SrcTy->getElementType();
+ Value *Iden =
+ Desc.getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags());
+ Value *Ops[] = {Iden, Src};
+ return VBuilder.createSimpleTargetReduction(Kind, SrcTy, Ops);
+}
+
Value *llvm::createTargetReduction(IRBuilderBase &B,
const RecurrenceDescriptor &Desc, Value *Src,
PHINode *OrigPhi) {
@@ -1288,6 +1301,20 @@ Value *llvm::createOrderedReduction(IRBuilderBase &B,
return B.CreateFAddReduce(Start, Src, EVL, Mask);
}
+Value *llvm::createOrderedReduction(VectorBuilder &VBuilder,
+ const RecurrenceDescriptor &Desc,
+ Value *Src, Value *Start) {
+ assert((Desc.getRecurrenceKind() == RecurKind::FAdd ||
+ Desc.getRecurrenceKind() == RecurKind::FMulAdd) &&
+ "Unexpected reduction kind");
+ assert(Src->getType()->isVectorTy() && "Expected a vector type");
+ assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
+
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ Value *Ops[] = {Start, Src};
+ return VBuilder.createSimpleTargetReduction(RecurKind::FAdd, SrcTy, Ops);
+}
+
void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue,
bool IncludeWrapFlags) {
auto *VecOp = dyn_cast<Instruction>(I);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6c370550114950..7039ce2c0e1fba 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1591,13 +1591,17 @@ void VPReductionEVLRecipe::execute(VPTransformState &State) {
Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true);
Value *VecOp = State.get(getVecOp(), 0);
Value *EVL = State.get(getEVL(), VPIteration(0, 0));
- Value *Cond = getCondOp() ? State.get(getCondOp(), 0) : nullptr;
+
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL);
+ if (getCondOp())
+ VBuilder.setMask(State.get(getCondOp(), 0));
Value *NewRed;
if (IsOrdered) {
- NewRed = createOrderedReduction(Builder, RdxDesc, VecOp, Prev, EVL, Cond);
+ NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
} else {
- NewRed = createSimpleTargetReduction(Builder, VecOp, Kind, EVL, Cond);
+ NewRed = createSimpleTargetReduction(VBuilder, VecOp, RdxDesc);
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
else
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
index 0c4a9fae81950e..b1ef442f705fce 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
@@ -1046,7 +1046,7 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP13]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
-; IF-EVL-NEXT: [[TMP19:%.*]] = call fast float @llvm.vp.reduce.fmin.nxv4f32(float 0x47EFFFFFE0000000, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call fast float @llvm.vp.reduce.fmin.nxv4f32(float 0x7FF0000000000000, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
; IF-EVL-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP19]], [[VEC_PHI]]
; IF-EVL-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP19]], float [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64
@@ -1148,7 +1148,7 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP13]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
-; IF-EVL-NEXT: [[TMP19:%.*]] = call fast float @llvm.vp.reduce.fmax.nxv4f32(float 0xC7EFFFFFE0000000, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call fast float @llvm.vp.reduce.fmax.nxv4f32(float 0xFFF0000000000000, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
; IF-EVL-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt float [[TMP19]], [[VEC_PHI]]
; IF-EVL-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP19]], float [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64
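The two start-value changes in these tests follow directly from the new code path: 0x47EFFFFFE0000000 and 0xC7EFFFFFE0000000 are +FLT_MAX and -FLT_MAX in LLVM's double hex notation, while 0x7FF0000000000000 and 0xFFF0000000000000 are +infinity and -infinity. createSimpleTargetReduction(VectorBuilder &, ...) takes its start value from RecurrenceDescriptor::getRecurrenceIdentity, which evidently chooses the infinities here; both are valid neutral elements for fmin/fmax, since min(x, +inf) == x and max(x, -inf) == x.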
>From 1f8b628c252e1823ec08d7724e7b4973a7653fb7 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 2 May 2024 06:16:08 -0700
Subject: [PATCH 12/16] Revert "Utils support"
This reverts commit 701a4d5e011ae014686b430b3ddf1f12f2a58c88.
---
llvm/include/llvm/IR/IRBuilder.h | 19 ---
.../include/llvm/Transforms/Utils/LoopUtils.h | 6 -
llvm/lib/IR/IRBuilder.cpp | 122 ------------------
llvm/lib/Transforms/Utils/LoopUtils.cpp | 56 --------
4 files changed, 203 deletions(-)
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 4db1fe5ff93aef..b6534a1962a2f5 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -746,68 +746,49 @@ class IRBuilderBase {
private:
CallInst *getReductionIntrinsic(Intrinsic::ID ID, Value *Src);
- // Helper function for creating VP reduce intrinsic call.
- CallInst *getReductionIntrinsic(Intrinsic::ID ID, Value *Acc, Value *Src,
- Value *Mask, Value *EVL);
-
public:
/// Create a sequential vector fadd reduction intrinsic of the source vector.
/// The first parameter is a scalar accumulator value. An unordered reduction
/// can be created by adding the reassoc fast-math flag to the resulting
/// sequential reduction.
CallInst *CreateFAddReduce(Value *Acc, Value *Src);
- CallInst *CreateFAddReduce(Value *Acc, Value *Src, Value *EVL,
- Value *Mask = nullptr);
/// Create a sequential vector fmul reduction intrinsic of the source vector.
/// The first parameter is a scalar accumulator value. An unordered reduction
/// can be created by adding the reassoc fast-math flag to the resulting
/// sequential reduction.
CallInst *CreateFMulReduce(Value *Acc, Value *Src);
- CallInst *CreateFMulReduce(Value *Acc, Value *Src, Value *EVL,
- Value *Mask = nullptr);
/// Create a vector int add reduction intrinsic of the source vector.
CallInst *CreateAddReduce(Value *Src);
- CallInst *CreateAddReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector int mul reduction intrinsic of the source vector.
CallInst *CreateMulReduce(Value *Src);
- CallInst *CreateMulReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector int AND reduction intrinsic of the source vector.
CallInst *CreateAndReduce(Value *Src);
- CallInst *CreateAndReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector int OR reduction intrinsic of the source vector.
CallInst *CreateOrReduce(Value *Src);
- CallInst *CreateOrReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector int XOR reduction intrinsic of the source vector.
CallInst *CreateXorReduce(Value *Src);
- CallInst *CreateXorReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector integer max reduction intrinsic of the source
/// vector.
CallInst *CreateIntMaxReduce(Value *Src, bool IsSigned = false);
- CallInst *CreateIntMaxReduce(Value *Src, Value *EVL, bool IsSigned = false,
- Value *Mask = nullptr);
/// Create a vector integer min reduction intrinsic of the source
/// vector.
CallInst *CreateIntMinReduce(Value *Src, bool IsSigned = false);
- CallInst *CreateIntMinReduce(Value *Src, Value *EVL, bool IsSigned = false,
- Value *Mask = nullptr);
/// Create a vector float max reduction intrinsic of the source
/// vector.
CallInst *CreateFPMaxReduce(Value *Src);
- CallInst *CreateFPMaxReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector float min reduction intrinsic of the source
/// vector.
CallInst *CreateFPMinReduce(Value *Src);
- CallInst *CreateFPMinReduce(Value *Src, Value *EVL, Value *Mask = nullptr);
/// Create a vector float maximum reduction intrinsic of the source
/// vector. This variant follows the NaN and signed zero semantic of
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 495836c95b5e46..fc4a1538dcf732 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -404,9 +404,6 @@ Value *getShuffleReduction(IRBuilderBase &Builder, Value *Src, unsigned Op,
/// Fast-math-flags are propagated using the IRBuilder's setting.
Value *createSimpleTargetReduction(IRBuilderBase &B, Value *Src,
RecurKind RdxKind);
-Value *createSimpleTargetReduction(IRBuilderBase &B, Value *Src,
- RecurKind RdxKind, Value *EVL,
- Value *Mask = nullptr);
Value *createSimpleTargetReduction(VectorBuilder &VB, Value *Src,
const RecurrenceDescriptor &Desc);
@@ -429,9 +426,6 @@ Value *createTargetReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc,
Value *createOrderedReduction(IRBuilderBase &B,
const RecurrenceDescriptor &Desc, Value *Src,
Value *Start);
-Value *createOrderedReduction(IRBuilderBase &B,
- const RecurrenceDescriptor &Desc, Value *Src,
- Value *Start, Value *EVL, Value *Mask = nullptr);
Value *createOrderedReduction(VectorBuilder &VB,
const RecurrenceDescriptor &Desc, Value *Src,
Value *Start);
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index d596253744d486..d6746d1d438242 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -414,20 +414,6 @@ CallInst *IRBuilderBase::getReductionIntrinsic(Intrinsic::ID ID, Value *Src) {
return CreateCall(Decl, Ops);
}
-CallInst *IRBuilderBase::getReductionIntrinsic(Intrinsic::ID ID, Value *Acc,
- Value *Src, Value *Mask,
- Value *EVL) {
- Module *M = GetInsertBlock()->getParent()->getParent();
- auto *SrcTy = cast<VectorType>(Src->getType());
- EVL = CreateIntCast(EVL, getInt32Ty(), /*isSigned=*/false);
- if (!Mask)
- Mask = CreateVectorSplat(SrcTy->getElementCount(), getTrue());
- Value *Ops[] = {Acc, Src, Mask, EVL};
- Type *Tys[] = {SrcTy};
- auto Decl = Intrinsic::getDeclaration(M, ID, Tys);
- return CreateCall(Decl, Ops);
-}
-
CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
Module *M = GetInsertBlock()->getParent()->getParent();
Value *Ops[] = {Acc, Src};
@@ -436,11 +422,6 @@ CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
return CreateCall(Decl, Ops);
}
-CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src, Value *EVL,
- Value *Mask) {
- return getReductionIntrinsic(Intrinsic::vp_reduce_fadd, Acc, Src, Mask, EVL);
-}
-
CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
Module *M = GetInsertBlock()->getParent()->getParent();
Value *Ops[] = {Acc, Src};
@@ -449,149 +430,46 @@ CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
return CreateCall(Decl, Ops);
}
-CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src, Value *EVL,
- Value *Mask) {
- return getReductionIntrinsic(Intrinsic::vp_reduce_fmul, Acc, Src, Mask, EVL);
-}
-
CallInst *IRBuilderBase::CreateAddReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_add, Src);
}
-CallInst *IRBuilderBase::CreateAddReduce(Value *Src, Value *EVL, Value *Mask) {
- auto *SrcTy = cast<VectorType>(Src->getType());
- auto *EltTy = SrcTy->getElementType();
- return getReductionIntrinsic(Intrinsic::vp_reduce_add,
- ConstantInt::get(EltTy, 0), Src, Mask, EVL);
-}
-
CallInst *IRBuilderBase::CreateMulReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_mul, Src);
}
-CallInst *IRBuilderBase::CreateMulReduce(Value *Src, Value *EVL, Value *Mask) {
- auto *SrcTy = cast<VectorType>(Src->getType());
- auto *EltTy = SrcTy->getElementType();
- return getReductionIntrinsic(Intrinsic::vp_reduce_mul,
- ConstantInt::get(EltTy, 1), Src, Mask, EVL);
-}
-
CallInst *IRBuilderBase::CreateAndReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_and, Src);
}
-CallInst *IRBuilderBase::CreateAndReduce(Value *Src, Value *EVL, Value *Mask) {
- auto *SrcTy = cast<VectorType>(Src->getType());
- auto *EltTy = SrcTy->getElementType();
- return getReductionIntrinsic(Intrinsic::vp_reduce_and,
- Constant::getAllOnesValue(EltTy), Src, Mask,
- EVL);
-}
-
CallInst *IRBuilderBase::CreateOrReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_or, Src);
}
-CallInst *IRBuilderBase::CreateOrReduce(Value *Src, Value *EVL, Value *Mask) {
- auto *SrcTy = cast<VectorType>(Src->getType());
- auto *EltTy = SrcTy->getElementType();
- return getReductionIntrinsic(Intrinsic::vp_reduce_or,
- ConstantInt::get(EltTy, 0), Src, Mask, EVL);
-}
-
CallInst *IRBuilderBase::CreateXorReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_xor, Src);
}
-CallInst *IRBuilderBase::CreateXorReduce(Value *Src, Value *EVL, Value *Mask) {
- auto *SrcTy = cast<VectorType>(Src->getType());
- auto *EltTy = SrcTy->getElementType();
- return getReductionIntrinsic(Intrinsic::vp_reduce_xor,
- ConstantInt::get(EltTy, 0), Src, Mask, EVL);
-}
-
CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) {
auto ID =
IsSigned ? Intrinsic::vector_reduce_smax : Intrinsic::vector_reduce_umax;
return getReductionIntrinsic(ID, Src);
}
-CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, Value *EVL,
- bool IsSigned, Value *Mask) {
- auto *SrcTy = cast<VectorType>(Src->getType());
- auto *EltTy = SrcTy->getElementType();
- return getReductionIntrinsic(
- IsSigned ? Intrinsic::vp_reduce_smax : Intrinsic::vp_reduce_umax,
- IsSigned ? ConstantInt::get(EltTy, APInt::getSignedMinValue(
- EltTy->getIntegerBitWidth()))
- : ConstantInt::get(EltTy, 0),
- Src, Mask, EVL);
-}
-
CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) {
auto ID =
IsSigned ? Intrinsic::vector_reduce_smin : Intrinsic::vector_reduce_umin;
return getReductionIntrinsic(ID, Src);
}
-CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, Value *EVL,
- bool IsSigned, Value *Mask) {
- auto *SrcTy = cast<VectorType>(Src->getType());
- auto *EltTy = SrcTy->getElementType();
- return getReductionIntrinsic(
- IsSigned ? Intrinsic::vp_reduce_smin : Intrinsic::vp_reduce_umin,
- IsSigned ? ConstantInt::get(EltTy, APInt::getSignedMaxValue(
- EltTy->getIntegerBitWidth()))
- : Constant::getAllOnesValue(EltTy),
- Src, Mask, EVL);
-}
-
CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_fmax, Src);
}
-CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, Value *EVL,
- Value *Mask) {
- auto *SrcTy = cast<VectorType>(Src->getType());
- auto *EltTy = SrcTy->getElementType();
- FastMathFlags FMF = getFastMathFlags();
- Value *Neutral;
- if (FMF.noNaNs())
- Neutral = FMF.noInfs()
- ? ConstantFP::get(
- EltTy, APFloat::getLargest(EltTy->getFltSemantics(),
- /*Negative=*/true))
- : ConstantFP::getInfinity(EltTy, true);
- else
- Neutral = ConstantFP::getQNaN(EltTy, /*Negative=*/true);
-
- return getReductionIntrinsic(Intrinsic::vp_reduce_fmax, Neutral, Src, Mask,
- EVL);
-}
-
CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_fmin, Src);
}
-CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, Value *EVL,
- Value *Mask) {
- auto *SrcTy = cast<VectorType>(Src->getType());
- auto *EltTy = SrcTy->getElementType();
- FastMathFlags FMF = getFastMathFlags();
- Value *Neutral;
- if (FMF.noNaNs())
- Neutral = FMF.noInfs()
- ? ConstantFP::get(
- EltTy, APFloat::getLargest(EltTy->getFltSemantics(),
- /*Negative=*/false))
- : ConstantFP::getInfinity(EltTy, false);
- else
- Neutral = ConstantFP::getQNaN(EltTy, /*Negative=*/false);
-
- return getReductionIntrinsic(Intrinsic::vp_reduce_fmin, Neutral, Src, Mask,
- EVL);
-}
-
CallInst *IRBuilderBase::CreateFPMaximumReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_fmaximum, Src);
}
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index c9e3727a5a3256..660dd60658a948 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1204,48 +1204,6 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src,
}
}
-Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src,
- RecurKind RdxKind, Value *EVL,
- Value *Mask) {
- auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType();
- switch (RdxKind) {
- case RecurKind::Add:
- return Builder.CreateAddReduce(Src, EVL, Mask);
- case RecurKind::Mul:
- return Builder.CreateMulReduce(Src, EVL, Mask);
- case RecurKind::And:
- return Builder.CreateAndReduce(Src, EVL, Mask);
- case RecurKind::Or:
- return Builder.CreateOrReduce(Src, EVL, Mask);
- case RecurKind::Xor:
- return Builder.CreateXorReduce(Src, EVL, Mask);
- case RecurKind::FMulAdd:
- case RecurKind::FAdd:
- return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy),
- Src, EVL, Mask);
- case RecurKind::FMul:
- return Builder.CreateFMulReduce(ConstantFP::get(SrcVecEltTy, 1.0), Src, EVL,
- Mask);
- case RecurKind::SMax:
- return Builder.CreateIntMaxReduce(Src, EVL, true, Mask);
- case RecurKind::SMin:
- return Builder.CreateIntMinReduce(Src, EVL, true, Mask);
- case RecurKind::UMax:
- return Builder.CreateIntMaxReduce(Src, EVL, false, Mask);
- case RecurKind::UMin:
- return Builder.CreateIntMinReduce(Src, EVL, false, Mask);
- case RecurKind::FMax:
- return Builder.CreateFPMaxReduce(Src, EVL, Mask);
- case RecurKind::FMin:
- return Builder.CreateFPMinReduce(Src, EVL, Mask);
- case RecurKind::FMinimum:
- case RecurKind::FMaximum:
- assert(0 && "FMaximum/FMinimum reduction VP intrinsic is not supported.");
- default:
- llvm_unreachable("Unhandled opcode");
- }
-}
-
Value *llvm::createSimpleTargetReduction(VectorBuilder &VBuilder, Value *Src,
const RecurrenceDescriptor &Desc) {
RecurKind Kind = Desc.getRecurrenceKind();
@@ -1287,20 +1245,6 @@ Value *llvm::createOrderedReduction(IRBuilderBase &B,
return B.CreateFAddReduce(Start, Src);
}
-Value *llvm::createOrderedReduction(IRBuilderBase &B,
- const RecurrenceDescriptor &Desc,
- Value *Src, Value *Start, Value *EVL,
- Value *Mask) {
- assert((Desc.getRecurrenceKind() == RecurKind::FAdd ||
- Desc.getRecurrenceKind() == RecurKind::FMulAdd) &&
- "Unexpected reduction kind");
- assert(Src->getType()->isVectorTy() && "Expected a vector type");
- assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
- assert(EVL->getType()->isIntegerTy() && "Expected a integer type");
-
- return B.CreateFAddReduce(Start, Src, EVL, Mask);
-}
-
Value *llvm::createOrderedReduction(VectorBuilder &VBuilder,
const RecurrenceDescriptor &Desc,
Value *Src, Value *Start) {
>From 2352368231f6ac2d374d76e85eaab543a95265a2 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 3 May 2024 00:29:24 -0700
Subject: [PATCH 13/16] Fix comments format
---
llvm/include/llvm/IR/VectorBuilder.h | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h
index 10ab6ee4cd1cb0..66b9e07763785b 100644
--- a/llvm/include/llvm/IR/VectorBuilder.h
+++ b/llvm/include/llvm/IR/VectorBuilder.h
@@ -99,11 +99,11 @@ class VectorBuilder {
ArrayRef<Value *> VecOpArray,
const Twine &Name = Twine());
- // Emit a VP reduction intrinsic call for recurrence \p Kind.
- // \p Kind The kind of recurrence
- // \p ValTy The type of operand which the reduction operation is
- // performed.
- // \p VecOpArray The operand list.
+ /// Emit a VP reduction intrinsic call for the given recurrence kind.
+ /// \param Kind The kind of recurrence.
+ /// \param ValTy The type of the operand on which the reduction operation is
+ /// performed.
+ /// \param VecOpArray The operand list.
Value *createSimpleTargetReduction(RecurKind Kind, Type *ValTy,
ArrayRef<Value *> VecOpArray,
const Twine &Name = Twine());
>From 057d77ff0c936b0285c7fb69585b72500cf26386 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 3 May 2024 01:33:51 -0700
Subject: [PATCH 14/16] Allow null underlying instruction
---
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a444064dab692a..77b48b6a319fcf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2141,7 +2141,7 @@ class VPReductionEVLRecipe : public VPSingleDefRecipe {
: VPSingleDefRecipe(
VPDef::VPReductionEVLSC,
ArrayRef<VPValue *>({R->getChainOp(), R->getVecOp(), EVL}),
- R->getUnderlyingInstr()),
+ cast_or_null<Instruction>(R->getUnderlyingValue())),
RdxDesc(R->getRecurrenceDescriptor()), IsOrdered(R->isOrdered()) {
VPValue *CondOp = R->getCondOp();
if (CondOp)
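This guards the case where the original VPReductionRecipe has no underlying IR instruction, as in the unit tests added in PATCH 10, which pass nullptr: getUnderlyingInstr() does an unconditional cast<Instruction> and would assert on a null value, whereas cast_or_null<Instruction> simply forwards the null.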
>From af3e8a5269b31bde2b64b65bee670b889f2d230f Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 3 May 2024 01:34:57 -0700
Subject: [PATCH 15/16] Implement mayWriteToMemory, mayReadFromMemory and
mayHaveSideEffects
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7039ce2c0e1fba..5d116b9a327f77 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -59,6 +59,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPPredInstPHISC:
return false;
case VPBlendSC:
+ case VPReductionEVLSC:
case VPReductionSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -97,6 +98,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenStoreSC:
return false;
case VPBlendSC:
+ case VPReductionEVLSC:
case VPReductionSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -140,6 +142,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
->mayHaveSideEffects();
case VPBlendSC:
+ case VPReductionEVLSC:
case VPReductionSC:
case VPScalarIVStepsSC:
case VPWidenCanonicalIVSC:
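Adding VPReductionEVLSC to all three switches models the recipe as reading no memory, writing no memory, and having no side effects, which is exactly what the EXPECT_FALSE assertions in the VPlanTest case from PATCH 10 check.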
>From afdb103b4ea777250c7171760dd74267b7f0a610 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 3 May 2024 16:40:09 +0800
Subject: [PATCH 16/16] Update llvm/lib/IR/VectorBuilder.cpp
Fix lambda format
Co-authored-by: Alexey Bataev <a.bataev at gmx.com>
---
llvm/lib/IR/VectorBuilder.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp
index 3bdedb8ffb2b5f..0514082d56abc1 100644
--- a/llvm/lib/IR/VectorBuilder.cpp
+++ b/llvm/lib/IR/VectorBuilder.cpp
@@ -63,7 +63,7 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy,
Value *VectorBuilder::createSimpleTargetReduction(RecurKind Kind, Type *ValTy,
ArrayRef<Value *> InstOpArray,
const Twine &Name) {
- auto getForRecurKind = [](RecurKind Kind) {
+ auto GetForRecurKind = [](RecurKind Kind) {
switch (Kind) {
case RecurKind::Add:
return Intrinsic::vp_reduce_add;
@@ -96,7 +96,7 @@ Value *VectorBuilder::createSimpleTargetReduction(RecurKind Kind, Type *ValTy,
return Intrinsic::not_intrinsic;
}
};
- auto VPID = getForRecurKind(Kind);
+ auto VPID = GetForRecurKind(Kind);
if (VPID == Intrinsic::not_intrinsic)
return returnWithError<Value *>("No VPIntrinsic for this reduction");
return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name);
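(The rename follows the LLVM coding standard: lambdas bound to local variables are named like variables, i.e. UpperCamelCase, hence GetForRecurKind.)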