[llvm] [LV][EVL] Fix the check for legality of folding with EVL. (PR #125678)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 4 04:56:51 PST 2025
https://github.com/Mel-Chen created https://github.com/llvm/llvm-project/pull/125678
The current legality check for folding with EVL has incomplete verification for VF.
This patch fixes the VF check, ensuring that tail folding with EVL is enabled only when a scalable VF is available. This allows loops that prefer tail folding with EVL but cannot use scalable VF vectorization to still be vectorized using a fixed VF, rather than abandoning vectorization entirely.
>From c5ad8e319efff9f98f304a06084b094f33f2f26c Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 4 Feb 2025 00:51:19 -0800
Subject: [PATCH] [LV][EVL] Fix the check for legality of folding with EVL.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 13 +-
...ze-force-tail-with-evl-inloop-reduction.ll | 208 ++++++++++++++----
...vectorize-force-tail-with-evl-reduction.ll | 206 +++++++++++++----
3 files changed, 343 insertions(+), 84 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 660a6ef574576b0..bd65614c2918396 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1454,9 +1454,10 @@ class LoopVectorizationCostModel {
// FIXME: Investigate opportunity for fixed vector factor.
// FIXME: support fixed-order recurrences by fixing splice of non VFxUF
// penultimate EVL.
- bool EVLIsLegal =
- UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) &&
- !EnableVPlanNativePath && Legal->getFixedOrderRecurrences().empty();
+ bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
+ TTI.hasActiveVectorLength(0, nullptr, Align()) &&
+ !EnableVPlanNativePath &&
+ Legal->getFixedOrderRecurrences().empty();
if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
@@ -4109,7 +4110,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
+ bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
+ setTailFoldingStyles(ContainsScalableVF, UserIC);
if (foldTailByMasking()) {
if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
LLVM_DEBUG(
@@ -4120,8 +4122,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// Tail folded loop using VP intrinsics restricts the VF to be scalable
// for now.
// TODO: extend it for fixed vectors, if required.
- assert(MaxFactors.ScalableVF.isScalable() &&
- "Expected scalable vector factor.");
+ assert(ContainsScalableVF && "Expected scalable vector factor.");
MaxFactors.FixedVF = ElementCount::getFixed(1);
}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
index 46340dea4bf8c24..6e7f634ce945907 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
@@ -132,18 +132,49 @@ for.end:
define i32 @mul(ptr %a, i64 %n, i32 %start) {
; IF-EVL-LABEL: @mul(
; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 3
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> poison)
+; IF-EVL-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> splat (i32 1)
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4]])
+; IF-EVL-NEXT: [[MUL]] = mul i32 [[TMP5]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4
+; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[MUL]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[RDX1:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT: [[MUL]] = mul nsw i32 [[TMP0]], [[RDX]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT: [[MUL1]] = mul nsw i32 [[TMP0]], [[RDX1]]
+; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL: for.end:
-; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL1]], [[FOR_BODY1]] ], [ [[MUL]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret i32 [[MUL_LCSSA]]
;
; NO-VP-LABEL: @mul(
@@ -1161,18 +1192,49 @@ for.end:
define float @fmul(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fmul(
; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 3
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x float> poison)
+; IF-EVL-NEXT: [[TMP4:%.*]] = select reassoc <4 x i1> [[TMP1]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> splat (float 1.000000e+00)
+; IF-EVL-NEXT: [[TMP5:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]])
+; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP5]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4
+; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[MUL]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[RDX1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP0]], [[RDX]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL-NEXT: [[MUL1]] = fmul reassoc float [[TMP0]], [[RDX1]]
+; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP23:![0-9]+]]
; IF-EVL: for.end:
-; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL1]], [[FOR_BODY1]] ], [ [[MUL]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MUL_LCSSA]]
;
; NO-VP-LABEL: @fmul(
@@ -1273,7 +1335,7 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
@@ -1289,7 +1351,7 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP17]], float [[RDX]]
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MIN_LCSSA]]
@@ -1395,7 +1457,7 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
@@ -1411,7 +1473,7 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP17]], float [[RDX]]
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MAX_LCSSA]]
@@ -1485,18 +1547,51 @@ for.end:
define float @fminimum(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fminimum(
; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison)
+; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[TMP5]])
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]])
-; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP29:![0-9]+]]
; IF-EVL: for.end:
-; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MIN_LCSSA]]
;
; NO-VP-LABEL: @fminimum(
@@ -1566,18 +1661,51 @@ for.end:
define float @fmaximum(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fmaximum(
; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison)
+; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[TMP5]])
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]])
-; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP31:![0-9]+]]
; IF-EVL: for.end:
-; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MAX_LCSSA]]
;
; NO-VP-LABEL: @fmaximum(
@@ -1682,7 +1810,7 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
@@ -1699,7 +1827,7 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP21]], float [[TMP22]], float [[RDX]])
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MULADD_LCSSA]]
@@ -1810,7 +1938,7 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
; IF-EVL-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]]
@@ -1829,7 +1957,7 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]]
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]]
@@ -1937,7 +2065,7 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
; IF-EVL-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]]
@@ -1956,7 +2084,7 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]]
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
index 7557c10892d6d46..b81805d4730020b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
@@ -133,18 +133,50 @@ for.end:
define i32 @mul(ptr %a, i64 %n, i32 %start) {
; IF-EVL-LABEL: @mul(
; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> splat (i32 1), i32 [[START:%.*]], i32 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ [[TMP9]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP1:%.*]] = add i64 [[IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; IF-EVL-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
+; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> poison)
+; IF-EVL-NEXT: [[TMP5]] = mul <8 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP5]], <8 x i32> [[VEC_PHI]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP6]])
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MUL]] = mul nsw i32 [[TMP0]], [[RDX]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL: for.end:
-; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY1]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret i32 [[MUL_LCSSA]]
;
; NO-VP-LABEL: @mul(
@@ -1202,18 +1234,50 @@ for.end:
define float @fmul(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fmul(
; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement <8 x float> splat (float 1.000000e+00), float [[START:%.*]], i32 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[TMP9]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP1:%.*]] = add i64 [[IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; IF-EVL-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP1]]
+; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x float> poison)
+; IF-EVL-NEXT: [[TMP5]] = fmul reassoc <8 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = select reassoc <8 x i1> [[TMP2]], <8 x float> [[TMP5]], <8 x float> [[VEC_PHI]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP8:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 1.000000e+00, <8 x float> [[TMP6]])
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP0]], [[RDX]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP23:![0-9]+]]
; IF-EVL: for.end:
-; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY1]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MUL_LCSSA]]
;
; NO-VP-LABEL: @fmul(
@@ -1316,7 +1380,7 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP15]])
; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1333,7 +1397,7 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP19]], float [[RDX]]
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MIN_LCSSA]]
@@ -1443,7 +1507,7 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[TMP15]])
; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1460,7 +1524,7 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP19]], float [[RDX]]
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MAX_LCSSA]]
@@ -1536,18 +1600,51 @@ for.end:
define float @fminimum(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fminimum(
; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison)
+; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[TMP5]])
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]])
-; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP29:![0-9]+]]
; IF-EVL: for.end:
-; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MIN_LCSSA]]
;
; NO-VP-LABEL: @fminimum(
@@ -1617,18 +1714,51 @@ for.end:
define float @fmaximum(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fmaximum(
; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison)
+; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[TMP5]])
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]])
-; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP31:![0-9]+]]
; IF-EVL: for.end:
-; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MAX_LCSSA]]
;
; NO-VP-LABEL: @fmaximum(
@@ -1733,7 +1863,7 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP20:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP17]])
; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1751,7 +1881,7 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP21]], float [[TMP22]], float [[RDX]])
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MULADD_LCSSA]]
@@ -1862,7 +1992,7 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
; IF-EVL-NEXT: [[TMP19:%.*]] = freeze i1 [[TMP18]]
@@ -1881,7 +2011,7 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]]
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]]
@@ -1989,7 +2119,7 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
; IF-EVL-NEXT: [[TMP19:%.*]] = freeze i1 [[TMP18]]
@@ -2008,7 +2138,7 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]]
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]]
More information about the llvm-commits
mailing list