[llvm] 4e69258 - [LoopVectorize] Add cost of generating tail-folding mask to the loop (#130565)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 21 02:24:59 PDT 2025
Author: David Sherwood
Date: 2025-03-21T09:24:56Z
New Revision: 4e69258bf390b51ed5bc65e7cea4cbcfbb62bd44
URL: https://github.com/llvm/llvm-project/commit/4e69258bf390b51ed5bc65e7cea4cbcfbb62bd44
DIFF: https://github.com/llvm/llvm-project/commit/4e69258bf390b51ed5bc65e7cea4cbcfbb62bd44.diff
LOG: [LoopVectorize] Add cost of generating tail-folding mask to the loop (#130565)
At the moment, if we decide to enable tail-folding, we do not include
the cost of generating the mask per VF. This can lead to some poor
choices of VF, which is definitely true for SVE-enabled AArch64
targets, where mask generation for fixed-width vectors is more
expensive than for scalable vectors.
I've added a VPInstruction::computeCost function to return the costs
of the ActiveLaneMask and ExplicitVectorLength operations.
Unfortunately, in order to prevent asserts from firing I've also had
to duplicate the same code in the legacy cost model to make sure the
chosen VFs match up. I've wrapped this up in an ifndef NDEBUG for
now. The alternative would be to disable the assert completely when
tail-folding, which I imagine would be just as bad.
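For context, the mask in question is the per-iteration active-lane
mask. As a rough sketch (operand names are illustrative, not taken
from any one test), the two forms that appear in the tests below are:

  ; Fixed-width form, e.g. VF = 4:
  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %n)

  ; Scalable form, e.g. VF = vscale x 4:
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index, i64 %n)

The VF comparison now accounts for the TTI cost of this intrinsic
(and of llvm.experimental.get.vector.length for EVL-style
tail-folding), so that, for example, the cheaper scalable form can
win on SVE.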
New tests added:
Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
Added:
llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0d2d5e1c312a1..dbce166bf220c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4603,8 +4603,31 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
continue;
InstructionCost C = CM.expectedCost(VF);
- VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
+ // Add on other costs that are modelled in VPlan, but not in the legacy
+ // cost model.
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
+ CM, CM.CostKind);
+ VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
+ assert(VectorRegion && "Expected to have a vector region!");
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(VectorRegion->getEntry()))) {
+ for (VPRecipeBase &R : *VPBB) {
+ auto *VPI = dyn_cast<VPInstruction>(&R);
+ if (!VPI)
+ continue;
+ switch (VPI->getOpcode()) {
+ case VPInstruction::ActiveLaneMask:
+ case VPInstruction::ExplicitVectorLength:
+ C += VPI->cost(VF, CostCtx);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
unsigned Width =
getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b40f7e6f99068..16d353b53b3ef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -801,6 +801,21 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
cast<VectorType>(VectorTy), Mask,
Ctx.CostKind, VF.getKnownMinValue() - 1);
}
+ case VPInstruction::ActiveLaneMask: {
+ Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
+ Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
+ {ArgTy, ArgTy});
+ return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
+ }
+ case VPInstruction::ExplicitVectorLength: {
+ Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
+ Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
+ Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
+ IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
+ I32Ty, {Arg0Ty, I32Ty, I1Ty});
+ return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
+ }
default:
// TODO: Compute cost other VPInstructions once the legacy cost model has
// been retired.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index a275d77c38fb4..38f28fc7dde42 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -498,9 +498,9 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; DEFAULT: [[VECTOR_MEMCHECK]]:
; DEFAULT-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[E]], i64 4
-; DEFAULT-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 2
-; DEFAULT-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 4
-; DEFAULT-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP5]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 2
+; DEFAULT-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; DEFAULT-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP2]]
; DEFAULT-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 4
; DEFAULT-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[B]], i64 4
; DEFAULT-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[C]], i64 4
@@ -613,112 +613,9 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
; PRED-LABEL: define i32 @header_mask_and_invariant_compare(
; PRED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], ptr [[E:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
; PRED-NEXT: [[ENTRY:.*]]:
-; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; PRED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
-; PRED: [[VECTOR_MEMCHECK]]:
-; PRED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[E]], i64 4
-; PRED-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 2
-; PRED-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 4
-; PRED-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP2]]
-; PRED-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 4
-; PRED-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[B]], i64 4
-; PRED-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[C]], i64 4
-; PRED-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[E]], [[SCEVGEP1]]
-; PRED-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[D]], [[SCEVGEP]]
-; PRED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; PRED-NEXT: [[BOUND05:%.*]] = icmp ult ptr [[E]], [[SCEVGEP2]]
-; PRED-NEXT: [[BOUND16:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
-; PRED-NEXT: [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]]
-; PRED-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]]
-; PRED-NEXT: [[BOUND08:%.*]] = icmp ult ptr [[E]], [[SCEVGEP3]]
-; PRED-NEXT: [[BOUND19:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
-; PRED-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
-; PRED-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
-; PRED-NEXT: [[BOUND012:%.*]] = icmp ult ptr [[E]], [[SCEVGEP4]]
-; PRED-NEXT: [[BOUND113:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]]
-; PRED-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
-; PRED-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]]
-; PRED-NEXT: [[BOUND016:%.*]] = icmp ult ptr [[D]], [[SCEVGEP2]]
-; PRED-NEXT: [[BOUND117:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
-; PRED-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
-; PRED-NEXT: [[CONFLICT_RDX19:%.*]] = or i1 [[CONFLICT_RDX15]], [[FOUND_CONFLICT18]]
-; PRED-NEXT: [[BOUND020:%.*]] = icmp ult ptr [[D]], [[SCEVGEP3]]
-; PRED-NEXT: [[BOUND121:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
-; PRED-NEXT: [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]]
-; PRED-NEXT: [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
-; PRED-NEXT: [[BOUND024:%.*]] = icmp ult ptr [[D]], [[SCEVGEP4]]
-; PRED-NEXT: [[BOUND125:%.*]] = icmp ult ptr [[C]], [[SCEVGEP1]]
-; PRED-NEXT: [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
-; PRED-NEXT: [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]]
-; PRED-NEXT: br i1 [[CONFLICT_RDX27]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
-; PRED: [[VECTOR_PH]]:
-; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; PRED-NEXT: [[TMP12:%.*]] = sub i64 [[TMP0]], 4
-; PRED-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP0]], 4
-; PRED-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
-; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
-; PRED: [[VECTOR_BODY]]:
-; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE37:.*]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE37]] ]
-; PRED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0
-; PRED-NEXT: [[TMP7:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META4:![0-9]+]]
-; PRED-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> poison, <4 x i32> zeroinitializer
-; PRED-NEXT: [[TMP8:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META7:![0-9]+]]
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; PRED-NEXT: [[TMP9:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]]
-; PRED-NEXT: [[TMP10:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META9:![0-9]+]]
-; PRED-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT30]], <4 x i32> poison, <4 x i32> zeroinitializer
-; PRED-NEXT: [[TMP11:%.*]] = icmp ugt <4 x i32> [[BROADCAST_SPLAT31]], [[TMP9]]
-; PRED-NEXT: [[TMP25:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer
-; PRED-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[D]], i64 [[TMP15]]
-; PRED-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP25]], i32 0
-; PRED-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; PRED: [[PRED_STORE_IF]]:
-; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP9]], i32 0
-; PRED-NEXT: store i32 [[TMP27]], ptr [[E]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
-; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]]
-; PRED: [[PRED_STORE_CONTINUE]]:
-; PRED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP25]], i32 1
-; PRED-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]]
-; PRED: [[PRED_STORE_IF32]]:
-; PRED-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP9]], i32 1
-; PRED-NEXT: store i32 [[TMP17]], ptr [[E]], align 4, !alias.scope [[META11]], !noalias [[META13]]
-; PRED-NEXT: br label %[[PRED_STORE_CONTINUE33]]
-; PRED: [[PRED_STORE_CONTINUE33]]:
-; PRED-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP25]], i32 2
-; PRED-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35:.*]]
-; PRED: [[PRED_STORE_IF34]]:
-; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP9]], i32 2
-; PRED-NEXT: store i32 [[TMP19]], ptr [[E]], align 4, !alias.scope [[META11]], !noalias [[META13]]
-; PRED-NEXT: br label %[[PRED_STORE_CONTINUE35]]
-; PRED: [[PRED_STORE_CONTINUE35]]:
-; PRED-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP25]], i32 3
-; PRED-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF36:.*]], label %[[PRED_STORE_CONTINUE37]]
-; PRED: [[PRED_STORE_IF36]]:
-; PRED-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP9]], i32 3
-; PRED-NEXT: store i32 [[TMP21]], ptr [[E]], align 4, !alias.scope [[META11]], !noalias [[META13]]
-; PRED-NEXT: br label %[[PRED_STORE_CONTINUE37]]
-; PRED: [[PRED_STORE_CONTINUE37]]:
-; PRED-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
-; PRED-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP23]], i32 4, <4 x i1> [[TMP25]]), !alias.scope [[META15:![0-9]+]], !noalias [[META16:![0-9]+]]
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
-; PRED-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; PRED-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP28]], i32 0
-; PRED-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; PRED: [[MIDDLE_BLOCK]]:
-; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; PRED: [[SCALAR_PH]]:
-; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; PRED-NEXT: br label %[[LOOP_HEADER:.*]]
; PRED: [[LOOP_HEADER]]:
-; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
; PRED-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4
; PRED-NEXT: [[L_B:%.*]] = load i32, ptr [[B]], align 4
; PRED-NEXT: [[OR:%.*]] = or i32 [[L_B]], [[L_A]]
@@ -733,7 +630,7 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
; PRED: [[LOOP_LATCH]]:
; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; PRED-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV]], [[N]]
-; PRED-NEXT: br i1 [[C_1]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP18:![0-9]+]]
+; PRED-NEXT: br i1 [[C_1]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
; PRED: [[EXIT]]:
; PRED-NEXT: ret i32 0
;
@@ -848,7 +745,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
; PRED-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[TMP17:%.*]] = extractelement <vscale x 2 x i1> [[TMP16]], i32 0
-; PRED-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; PRED-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; PRED: [[SCALAR_PH]]:
@@ -866,7 +763,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; PRED-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8
; PRED-NEXT: [[IV_CLAMP:%.*]] = and i64 [[IV]], 4294967294
; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_CLAMP]], 512
-; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
+; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; PRED: [[EXIT]]:
; PRED-NEXT: ret void
;
@@ -1078,7 +975,7 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; PRED: [[PRED_STORE_CONTINUE14]]:
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; PRED-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; PRED-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; PRED: [[SCALAR_PH]]:
@@ -1091,7 +988,7 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; PRED-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1
; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7
-; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP22:![0-9]+]]
+; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; PRED: [[EXIT]]:
; PRED-NEXT: ret void
;
@@ -1116,7 +1013,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-SAME: ptr noalias [[SRC_1:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[SRC_3:%.*]], ptr noalias [[SRC_4:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
; DEFAULT-NEXT: [[ENTRY:.*]]:
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
; DEFAULT: [[VECTOR_SCEVCHECK]]:
; DEFAULT-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
@@ -1126,14 +1023,14 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[MUL_RESULT]]
; DEFAULT-NEXT: [[TMP3:%.*]] = icmp ult ptr [[TMP2]], [[DST]]
; DEFAULT-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
-; DEFAULT-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 4
-; DEFAULT-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
-; DEFAULT-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
-; DEFAULT-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
-; DEFAULT-NEXT: [[TMP5:%.*]] = sub i64 0, [[MUL_RESULT3]]
-; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]]
-; DEFAULT-NEXT: [[TMP7:%.*]] = icmp ult ptr [[TMP6]], [[SCEVGEP1]]
-; DEFAULT-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW4]]
+; DEFAULT-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
+; DEFAULT-NEXT: [[MUL1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
+; DEFAULT-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i64, i1 } [[MUL1]], 0
+; DEFAULT-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i64, i1 } [[MUL1]], 1
+; DEFAULT-NEXT: [[TMP5:%.*]] = sub i64 0, [[MUL_RESULT2]]
+; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT2]]
+; DEFAULT-NEXT: [[TMP7:%.*]] = icmp ult ptr [[TMP6]], [[SCEVGEP]]
+; DEFAULT-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW3]]
; DEFAULT-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[DST]], i64 8
; DEFAULT-NEXT: [[MUL5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
; DEFAULT-NEXT: [[MUL_RESULT6:%.*]] = extractvalue { i64, i1 } [[MUL5]], 0
@@ -1340,14 +1237,14 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; PRED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[MUL_RESULT]]
; PRED-NEXT: [[TMP3:%.*]] = icmp ult ptr [[TMP2]], [[DST]]
; PRED-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
-; PRED-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 4
-; PRED-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
-; PRED-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
-; PRED-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
-; PRED-NEXT: [[TMP5:%.*]] = sub i64 0, [[MUL_RESULT3]]
-; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]]
-; PRED-NEXT: [[TMP7:%.*]] = icmp ult ptr [[TMP6]], [[SCEVGEP1]]
-; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW4]]
+; PRED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
+; PRED-NEXT: [[MUL1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
+; PRED-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i64, i1 } [[MUL1]], 0
+; PRED-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i64, i1 } [[MUL1]], 1
+; PRED-NEXT: [[TMP5:%.*]] = sub i64 0, [[MUL_RESULT2]]
+; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT2]]
+; PRED-NEXT: [[TMP7:%.*]] = icmp ult ptr [[TMP6]], [[SCEVGEP]]
+; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW3]]
; PRED-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[DST]], i64 8
; PRED-NEXT: [[MUL5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
; PRED-NEXT: [[MUL_RESULT6:%.*]] = extractvalue { i64, i1 } [[MUL5]], 0
@@ -1515,7 +1412,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; PRED-NEXT: [[TMP84:%.*]] = xor <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
; PRED-NEXT: [[TMP85:%.*]] = extractelement <8 x i1> [[TMP84]], i32 0
-; PRED-NEXT: br i1 [[TMP85]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; PRED-NEXT: br i1 [[TMP85]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; PRED: [[SCALAR_PH]]:
@@ -1545,7 +1442,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; PRED: [[LOOP_LATCH]]:
; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP24:![0-9]+]]
+; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
; PRED: [[EXIT]]:
; PRED-NEXT: ret void
;
@@ -1578,7 +1475,7 @@ if.then:
loop.latch:
%iv.next = add i64 %iv, 1
%ec = icmp eq i64 %iv, %N
- br i1 %ec, label %exit, label %loop.header
+ br i1 %ec, label %exit, label %loop.header, !llvm.loop !0
exit:
ret void
@@ -1691,7 +1588,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; PRED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; PRED-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; PRED-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; PRED: [[SCALAR_PH]]:
@@ -1707,7 +1604,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
; PRED-NEXT: [[T:%.*]] = trunc nuw nsw i64 [[IV_NEXT]] to i32
; PRED-NEXT: store i32 [[T]], ptr [[DST]], align 4
; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 21
-; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP26:![0-9]+]]
+; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
; PRED: [[EXIT]]:
; PRED-NEXT: ret void
;
@@ -1738,3 +1635,7 @@ declare float @llvm.fmuladd.f32(float, float, float) #1
attributes #1 = { "target-cpu"="neoverse-512tvb" }
attributes #2 = { vscale_range(2,2) "target-cpu"="neoverse-512tvb" }
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.vectorize.width", i32 8}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 850349c5d3c0b..d349be1f6e70a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -99,49 +99,49 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; PRED: [[VECTOR_MEMCHECK]]:
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 16
; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
; PRED-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
; PRED-NEXT: [[TMP8:%.*]] = sub i64 [[TMP5]], 1
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP8]]
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; PRED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 16
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[X]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PRED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; PRED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16
; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], [[TMP12]]
; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
-; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT: [[TMP16:%.*]] = trunc <vscale x 8 x i32> [[BROADCAST_SPLAT]] to <vscale x 8 x i16>
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[TMP16:%.*]] = trunc <vscale x 16 x i32> [[BROADCAST_SPLAT]] to <vscale x 16 x i16>
; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
; PRED: [[VECTOR_BODY]]:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0
; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
; PRED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
-; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; PRED-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
-; PRED-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i16> [[TMP20]], [[TMP16]]
-; PRED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
-; PRED-NEXT: [[TMP23:%.*]] = or <vscale x 8 x i16> [[TMP21]], [[TMP22]]
-; PRED-NEXT: [[TMP24:%.*]] = lshr <vscale x 8 x i16> [[TMP23]], trunc (<vscale x 8 x i32> splat (i32 1) to <vscale x 8 x i16>)
-; PRED-NEXT: [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP24]] to <vscale x 8 x i8>
+; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; PRED-NEXT: [[TMP24:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
+; PRED-NEXT: [[TMP25:%.*]] = mul <vscale x 16 x i16> [[TMP24]], [[TMP16]]
+; PRED-NEXT: [[TMP20:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
+; PRED-NEXT: [[TMP21:%.*]] = or <vscale x 16 x i16> [[TMP25]], [[TMP20]]
+; PRED-NEXT: [[TMP22:%.*]] = lshr <vscale x 16 x i16> [[TMP21]], trunc (<vscale x 16 x i32> splat (i32 1) to <vscale x 16 x i16>)
+; PRED-NEXT: [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP22]] to <vscale x 16 x i8>
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; PRED-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
-; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i1> [[TMP28]], i32 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
+; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[TMP28]], i32 0
; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
new file mode 100644
index 0000000000000..121a6ed53309e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
@@ -0,0 +1,211 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=TFNONE
+; RUN: opt < %s -passes=loop-vectorize,instsimplify,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TFCOMMON,TFALWAYS
+; RUN: opt < %s -passes=loop-vectorize,instsimplify,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=TFCOMMON,TFFALLBACK
+; RUN: opt < %s -passes=loop-vectorize,instsimplify,simplifycfg -force-vector-interleave=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TFA_INTERLEAVE
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
+; TFNONE-LABEL: @test_widen_exp_v2(
+; TFNONE-NEXT: entry:
+; TFNONE-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; TFNONE: vector.ph:
+; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
+; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
+; TFNONE: vector.body:
+; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TFNONE-NEXT: [[TMP7:%.*]] = load double, ptr [[P2:%.*]], align 8
+; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
+; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; TFNONE-NEXT: [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]])
+; TFNONE-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer
+; TFNONE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
+; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
+; TFNONE-NEXT: store double [[TMP14]], ptr [[P:%.*]], align 8
+; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; TFNONE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; TFNONE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; TFNONE: middle.block:
+; TFNONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; TFNONE-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
+; TFNONE: scalar.ph:
+; TFNONE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; TFNONE-NEXT: br label [[LOOP:%.*]]
+; TFNONE: loop:
+; TFNONE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_END:%.*]] ]
+; TFNONE-NEXT: [[LD:%.*]] = load double, ptr [[P2]], align 8
+; TFNONE-NEXT: [[EXP:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR2:[0-9]+]]
+; TFNONE-NEXT: [[COND1:%.*]] = fcmp ogt double [[EXP]], 0.000000e+00
+; TFNONE-NEXT: br i1 [[COND1]], label [[LOOP_MIDDLE:%.*]], label [[LOOP_END]]
+; TFNONE: loop.middle:
+; TFNONE-NEXT: br label [[LOOP_END]]
+; TFNONE: loop.end:
+; TFNONE-NEXT: [[SINK:%.*]] = phi double [ 0.000000e+00, [[LOOP_MIDDLE]] ], [ 1.000000e+00, [[LOOP]] ]
+; TFNONE-NEXT: store double [[SINK]], ptr [[P]], align 8
+; TFNONE-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; TFNONE-NEXT: [[COND2:%.*]] = icmp eq i64 [[IV]], [[N]]
+; TFNONE-NEXT: br i1 [[COND2]], label [[END]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; TFNONE: end:
+; TFNONE-NEXT: ret void
+;
+; TFCOMMON-LABEL: @test_widen_exp_v2(
+; TFCOMMON-NEXT: entry:
+; TFCOMMON-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
+; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
+; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TFCOMMON-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 2
+; TFCOMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], 2
+; TFCOMMON-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0
+; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
+; TFCOMMON-NEXT: br label [[LOOP:%.*]]
+; TFCOMMON: vector.body:
+; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; TFCOMMON-NEXT: [[LD:%.*]] = load double, ptr [[P2:%.*]], align 8
+; TFCOMMON-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR3:[0-9]+]]
+; TFCOMMON-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR3]]
+; TFCOMMON-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
+; TFCOMMON-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
+; TFCOMMON-NEXT: [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
+; TFCOMMON-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true)
+; TFCOMMON-NEXT: [[TMP11:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
+; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
+; TFCOMMON-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; TFCOMMON-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; TFCOMMON: pred.store.if:
+; TFCOMMON-NEXT: [[SINK:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 0
+; TFCOMMON-NEXT: store double [[SINK]], ptr [[P:%.*]], align 8
+; TFCOMMON-NEXT: br label [[PRED_STORE_CONTINUE]]
+; TFCOMMON: pred.store.continue:
+; TFCOMMON-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; TFCOMMON-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE6]]
+; TFCOMMON: pred.store.if1:
+; TFCOMMON-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
+; TFCOMMON-NEXT: store double [[TMP19]], ptr [[P]], align 8
+; TFCOMMON-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; TFCOMMON: pred.store.continue2:
+; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]])
+; TFCOMMON-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; TFCOMMON-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP18]], i32 0
+; TFCOMMON-NEXT: br i1 [[TMP17]], label [[END:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; TFCOMMON: end:
+; TFCOMMON-NEXT: ret void
+;
+; TFA_INTERLEAVE-LABEL: @test_widen_exp_v2(
+; TFA_INTERLEAVE-NEXT: entry:
+; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0:%.*]], 1
+; TFA_INTERLEAVE-NEXT: [[N_RND_UP1:%.*]] = add i64 [[N_RND_UP]], 3
+; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP1]], 4
+; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP1]], [[N_MOD_VF]]
+; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = sub i64 [[N_RND_UP]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[N_RND_UP]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[N_RND_UP]])
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 2, i64 [[N_RND_UP]])
+; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; TFA_INTERLEAVE: vector.body:
+; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2:%.*]], align 8
+; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3:[0-9]+]]
+; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3]]
+; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
+; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
+; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3]]
+; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3]]
+; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i32 0
+; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[TMP10]], i32 1
+; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = fcmp ogt <2 x double> [[TMP12]], zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = xor <2 x i1> [[TMP13]], splat (i1 true)
+; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[TMP14]], splat (i1 true)
+; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK2]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP17]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP18]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; TFA_INTERLEAVE-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; TFA_INTERLEAVE: pred.store.if:
+; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 0
+; TFA_INTERLEAVE-NEXT: store double [[TMP20]], ptr [[P:%.*]], align 8
+; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE]]
+; TFA_INTERLEAVE: pred.store.continue:
+; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; TFA_INTERLEAVE-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
+; TFA_INTERLEAVE: pred.store.if4:
+; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
+; TFA_INTERLEAVE-NEXT: store double [[TMP22]], ptr [[P]], align 8
+; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE5]]
+; TFA_INTERLEAVE: pred.store.continue5:
+; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK2]], i32 0
+; TFA_INTERLEAVE-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; TFA_INTERLEAVE: pred.store.if6:
+; TFA_INTERLEAVE-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 0
+; TFA_INTERLEAVE-NEXT: store double [[TMP32]], ptr [[P]], align 8
+; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE7]]
+; TFA_INTERLEAVE: pred.store.continue7:
+; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK2]], i32 1
+; TFA_INTERLEAVE-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
+; TFA_INTERLEAVE: pred.store.if8:
+; TFA_INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 1
+; TFA_INTERLEAVE-NEXT: store double [[TMP34]], ptr [[P]], align 8
+; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE9]]
+; TFA_INTERLEAVE: pred.store.continue9:
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 2
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]])
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[TMP27]], i64 [[TMP3]])
+; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP28]], i32 0
+; TFA_INTERLEAVE-NEXT: br i1 [[TMP30]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; TFA_INTERLEAVE: end:
+; TFA_INTERLEAVE-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.end ]
+ %ld = load double, ptr %p2, align 8
+ %exp = tail call double @llvm.exp.f64(double %ld) #6
+ %cond1 = fcmp ogt double %exp, 0.000000e+00
+ br i1 %cond1, label %loop.middle, label %loop.end
+
+loop.middle:
+ br label %loop.end
+
+loop.end:
+ %sink = phi double [ 0.000000e+00, %loop.middle ], [ 1.000000e+00, %loop ]
+ store double %sink, ptr %p, align 8
+ %iv.next = add i64 %iv, 1
+ %cond2 = icmp eq i64 %iv, %n
+ br i1 %cond2, label %end, label %loop, !llvm.loop !0
+
+end:
+ ret void
+}
+
+
+declare double @llvm.exp.f64(double)
+
+; fixed-width variant of exp
+declare <2 x double> @exp_fixed(<2 x double>)
+
+;; scalable vector variant of exp
+declare <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double>, <vscale x 2 x i1>)
+
+attributes #5 = { "target-cpu"="neoverse-v2" vscale_range(1,16) }
+attributes #6 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_llvm.exp.f64(exp_fixed),_ZGVsMxv_llvm.exp.f64(exp_masked_scalable)" }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.width", i32 2}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; TFALWAYS: {{.*}}
+; TFFALLBACK: {{.*}}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 99544ac63a466..36c3a4a2b4e43 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -1040,167 +1040,65 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFNONE: [[END]]:
; TFNONE-NEXT: ret void
;
-; TFALWAYS-LABEL: define void @test_widen_exp_v2(
-; TFALWAYS-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
-; TFALWAYS-NEXT: [[ENTRY:.*]]:
-; TFALWAYS-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; TFALWAYS-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
-; TFALWAYS-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
-; TFALWAYS-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TFALWAYS-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 2
-; TFALWAYS-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], 2
-; TFALWAYS-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0
-; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
-; TFALWAYS-NEXT: br label %[[VECTOR_BODY:.*]]
-; TFALWAYS: [[VECTOR_BODY]]:
-; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ]
-; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ]
-; TFALWAYS-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8
-; TFALWAYS-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
-; TFALWAYS-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
-; TFALWAYS-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
-; TFALWAYS-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
-; TFALWAYS-NEXT: [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
-; TFALWAYS-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true)
-; TFALWAYS-NEXT: [[TMP11:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
-; TFALWAYS-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
-; TFALWAYS-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; TFALWAYS-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; TFALWAYS: [[PRED_STORE_IF]]:
-; TFALWAYS-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 0
-; TFALWAYS-NEXT: store double [[TMP13]], ptr [[P]], align 8
-; TFALWAYS-NEXT: br label %[[PRED_STORE_CONTINUE]]
-; TFALWAYS: [[PRED_STORE_CONTINUE]]:
-; TFALWAYS-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; TFALWAYS-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]]
-; TFALWAYS: [[PRED_STORE_IF1]]:
-; TFALWAYS-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
-; TFALWAYS-NEXT: store double [[TMP15]], ptr [[P]], align 8
-; TFALWAYS-NEXT: br label %[[PRED_STORE_CONTINUE2]]
-; TFALWAYS: [[PRED_STORE_CONTINUE2]]:
-; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]])
-; TFALWAYS-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; TFALWAYS-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP16]], i32 0
-; TFALWAYS-NEXT: br i1 [[TMP17]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; TFALWAYS: [[END]]:
-; TFALWAYS-NEXT: ret void
-;
-; TFFALLBACK-LABEL: define void @test_widen_exp_v2(
-; TFFALLBACK-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
-; TFFALLBACK-NEXT: [[ENTRY:.*]]:
-; TFFALLBACK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; TFFALLBACK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
-; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
-; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TFFALLBACK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 2
-; TFFALLBACK-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], 2
-; TFFALLBACK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0
-; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
-; TFFALLBACK-NEXT: br label %[[VECTOR_BODY:.*]]
-; TFFALLBACK: [[VECTOR_BODY]]:
-; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ]
-; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ]
-; TFFALLBACK-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8
-; TFFALLBACK-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
-; TFFALLBACK-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
-; TFFALLBACK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
-; TFFALLBACK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
-; TFFALLBACK-NEXT: [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
-; TFFALLBACK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true)
-; TFFALLBACK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
-; TFFALLBACK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
-; TFFALLBACK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; TFFALLBACK-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; TFFALLBACK: [[PRED_STORE_IF]]:
-; TFFALLBACK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 0
-; TFFALLBACK-NEXT: store double [[TMP13]], ptr [[P]], align 8
-; TFFALLBACK-NEXT: br label %[[PRED_STORE_CONTINUE]]
-; TFFALLBACK: [[PRED_STORE_CONTINUE]]:
-; TFFALLBACK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; TFFALLBACK-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]]
-; TFFALLBACK: [[PRED_STORE_IF1]]:
-; TFFALLBACK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
-; TFFALLBACK-NEXT: store double [[TMP15]], ptr [[P]], align 8
-; TFFALLBACK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
-; TFFALLBACK: [[PRED_STORE_CONTINUE2]]:
-; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]])
-; TFFALLBACK-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; TFFALLBACK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP16]], i32 0
-; TFFALLBACK-NEXT: br i1 [[TMP17]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; TFFALLBACK: [[END]]:
-; TFFALLBACK-NEXT: ret void
+; TFCOMMON-LABEL: define void @test_widen_exp_v2(
+; TFCOMMON-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; TFCOMMON-NEXT: [[ENTRY:.*]]:
+; TFCOMMON-NEXT: br label %[[LOOP:.*]]
+; TFCOMMON: [[LOOP]]:
+; TFCOMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; TFCOMMON-NEXT: [[LD:%.*]] = load double, ptr [[P2]], align 8
+; TFCOMMON-NEXT: [[EXP:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR7:[0-9]+]]
+; TFCOMMON-NEXT: [[COND1:%.*]] = fcmp ogt double [[EXP]], 0.000000e+00
+; TFCOMMON-NEXT: [[SINK:%.*]] = select i1 [[COND1]], double 0.000000e+00, double 1.000000e+00
+; TFCOMMON-NEXT: store double [[SINK]], ptr [[P]], align 8
+; TFCOMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; TFCOMMON-NEXT: [[COND2:%.*]] = icmp eq i64 [[IV]], [[N]]
+; TFCOMMON-NEXT: br i1 [[COND2]], label %[[END:.*]], label %[[LOOP]]
+; TFCOMMON: [[END]]:
+; TFCOMMON-NEXT: ret void
;
; TFA_INTERLEAVE-LABEL: define void @test_widen_exp_v2(
; TFA_INTERLEAVE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
; TFA_INTERLEAVE-NEXT: [[ENTRY:.*]]:
; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
+; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 4
-; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], 2
; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0
-; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
-; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 2, i64 [[TMP0]])
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = icmp ult i64 0, [[TMP0]]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ult i64 1, [[TMP0]]
; TFA_INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]]
; TFA_INTERLEAVE: [[VECTOR_BODY]]:
-; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE9:.*]] ]
-; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE9]] ]
-; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], %[[PRED_STORE_CONTINUE9]] ]
+; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP27:%.*]], %[[PRED_STORE_CONTINUE5:.*]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE5]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[PRED_STORE_CONTINUE5]] ]
; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8
-; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
-; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
-; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
-; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
-; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
-; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
-; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i32 0
-; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[TMP10]], i32 1
-; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
-; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = fcmp ogt <2 x double> [[TMP12]], zeroinitializer
-; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = xor <2 x i1> [[TMP13]], splat (i1 true)
-; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[TMP14]], splat (i1 true)
-; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer
-; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK2]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer
-; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP17]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
-; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP18]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
-; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; TFA_INTERLEAVE-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; TFA_INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; TFA_INTERLEAVE: [[PRED_STORE_IF]]:
-; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 0
-; TFA_INTERLEAVE-NEXT: store double [[TMP20]], ptr [[P]], align 8
+; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
+; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = fcmp ogt double [[TMP5]], 0.000000e+00
+; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true
+; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = select i1 [[TMP7]], double 1.000000e+00, double 0.000000e+00
+; TFA_INTERLEAVE-NEXT: store double [[TMP24]], ptr [[P]], align 8
; TFA_INTERLEAVE-NEXT: br label %[[PRED_STORE_CONTINUE]]
; TFA_INTERLEAVE: [[PRED_STORE_CONTINUE]]:
-; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; TFA_INTERLEAVE-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]]
+; TFA_INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK2]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5]]
; TFA_INTERLEAVE: [[PRED_STORE_IF4]]:
-; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
-; TFA_INTERLEAVE-NEXT: store double [[TMP22]], ptr [[P]], align 8
+; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
+; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = fcmp ogt double [[TMP8]], 0.000000e+00
+; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = xor i1 [[TMP9]], true
+; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = select i1 [[TMP10]], double 1.000000e+00, double 0.000000e+00
+; TFA_INTERLEAVE-NEXT: store double [[TMP26]], ptr [[P]], align 8
; TFA_INTERLEAVE-NEXT: br label %[[PRED_STORE_CONTINUE5]]
; TFA_INTERLEAVE: [[PRED_STORE_CONTINUE5]]:
-; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK2]], i32 0
-; TFA_INTERLEAVE-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
-; TFA_INTERLEAVE: [[PRED_STORE_IF6]]:
-; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 0
-; TFA_INTERLEAVE-NEXT: store double [[TMP24]], ptr [[P]], align 8
-; TFA_INTERLEAVE-NEXT: br label %[[PRED_STORE_CONTINUE7]]
-; TFA_INTERLEAVE: [[PRED_STORE_CONTINUE7]]:
-; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK2]], i32 1
-; TFA_INTERLEAVE-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9]]
-; TFA_INTERLEAVE: [[PRED_STORE_IF8]]:
-; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 1
-; TFA_INTERLEAVE-NEXT: store double [[TMP26]], ptr [[P]], align 8
-; TFA_INTERLEAVE-NEXT: br label %[[PRED_STORE_CONTINUE9]]
-; TFA_INTERLEAVE: [[PRED_STORE_CONTINUE9]]:
-; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 2
-; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]])
-; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[TMP27]], i64 [[TMP3]])
-; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <2 x i1> [[TMP28]], i32 0
-; TFA_INTERLEAVE-NEXT: br i1 [[TMP29]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; TFA_INTERLEAVE-NEXT: [[TMP27]] = add i64 [[INDEX]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP3]]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = icmp ult i64 [[TMP11]], [[TMP3]]
+; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = xor i1 [[ACTIVE_LANE_MASK_NEXT]], true
+; TFA_INTERLEAVE-NEXT: br i1 [[TMP12]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; TFA_INTERLEAVE: [[END]]:
; TFA_INTERLEAVE-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index f4afd37f4881a..625a24dbf9568 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -467,44 +467,44 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
; PRED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
; PRED-NEXT: [[TMP5:%.*]] = sub i64 [[TMP2]], 1
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP5]]
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
; PRED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; PRED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
; PRED-NEXT: [[TMP10:%.*]] = sub i64 [[TMP0]], [[TMP9]]
; PRED-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], [[TMP9]]
; PRED-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
-; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[X]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[X]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
; PRED: [[VECTOR_BODY]]:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
; PRED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
; PRED-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP13]]
; PRED-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[TMP14]], i32 0
-; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP15]], i32 2, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> poison)
-; PRED-NEXT: [[TMP19:%.*]] = udiv <vscale x 4 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; PRED-NEXT: [[TMP20:%.*]] = or <vscale x 4 x i16> [[TMP19]], [[VEC_PHI]]
-; PRED-NEXT: [[TMP16]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> [[TMP20]], <vscale x 4 x i16> [[VEC_PHI]]
+; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP15]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> poison)
+; PRED-NEXT: [[TMP20:%.*]] = udiv <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
+; PRED-NEXT: [[TMP21:%.*]] = or <vscale x 8 x i16> [[TMP20]], [[VEC_PHI]]
+; PRED-NEXT: [[TMP16]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[TMP21]], <vscale x 8 x i16> [[VEC_PHI]]
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
-; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
-; PRED-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; PRED-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP12]])
+; PRED-NEXT: [[TMP17:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[TMP18:%.*]] = extractelement <vscale x 8 x i1> [[TMP17]], i32 0
; PRED-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
-; PRED-NEXT: [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> [[TMP16]])
+; PRED-NEXT: [[TMP19:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16(<vscale x 8 x i16> [[TMP16]])
; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; PRED: [[SCALAR_PH]]:
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; PRED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; PRED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; PRED-NEXT: br label %[[LOOP:.*]]
; PRED: [[LOOP]]:
; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -517,7 +517,7 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; PRED: [[EXIT]]:
-; PRED-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP22]], %[[MIDDLE_BLOCK]] ]
+; PRED-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ]
; PRED-NEXT: ret i16 [[RED_NEXT_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
index af266653af803..c0c5c8a7da26e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
@@ -1,4 +1,7 @@
-; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue <%s | FileCheck %s
+; REQUIRES: asserts
+; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -debug-only=loop-vectorize < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck --check-prefix=COST %s
target triple = "aarch64-unknown-linux-gnu"
@@ -32,4 +35,29 @@ for.end: ; preds = %for.body
ret i32 0
}
+; COST: LV: Checking a loop in 'simple_memset'
+; COST: Cost of 4 for VF 2: EMIT{{.*}}active lane mask
+; COST: Cost of 8 for VF 4: EMIT{{.*}}active lane mask
+; COST: Cost of Invalid for VF vscale x 1: EMIT{{.*}}active lane mask
+; COST: Cost of 1 for VF vscale x 2: EMIT{{.*}}active lane mask
+; COST: Cost of 1 for VF vscale x 4: EMIT{{.*}}active lane mask
+
+define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
+; CHECK-LABEL: @simple_memset(
+; CHECK: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>
+entry:
+ br label %while.body
+
+while.body: ; preds = %while.body, %entry
+ %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+ %gep = getelementptr i32, ptr %ptr, i64 %index
+ store i32 %val, ptr %gep
+ %index.next = add nsw i64 %index, 1
+ %cmp10 = icmp ult i64 %index.next, %n
+ br i1 %cmp10, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+ ret void
+}
+
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
new file mode 100644
index 0000000000000..b4afdd7cf7d54
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
@@ -0,0 +1,34 @@
+; REQUIRES: asserts
+; RUN: opt < %s -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple riscv64-linux-gnu -mattr=+v,+f -S -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+; RUN: opt < %s -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple riscv64-linux-gnu -force-tail-folding-style=data-with-evl -mattr=+v,+f -S \
+; RUN: -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=EVL
+
+; CHECK: Cost of 2 for VF 2: EMIT{{.*}} = active lane mask
+; CHECK: Cost of 4 for VF 4: EMIT{{.*}} = active lane mask
+; CHECK: Cost of 8 for VF 8: EMIT{{.*}} = active lane mask
+; CHECK: Cost of 2 for VF vscale x 1: EMIT{{.*}} = active lane mask
+; CHECK: Cost of 4 for VF vscale x 2: EMIT{{.*}} = active lane mask
+; CHECK: Cost of 8 for VF vscale x 4: EMIT{{.*}} = active lane mask
+
+; EVL: Cost of 1 for VF vscale x 1: EMIT{{.*}} = EXPLICIT-VECTOR-LENGTH
+; EVL: Cost of 1 for VF vscale x 2: EMIT{{.*}} = EXPLICIT-VECTOR-LENGTH
+; EVL: Cost of 1 for VF vscale x 4: EMIT{{.*}} = EXPLICIT-VECTOR-LENGTH
+
+define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
+entry:
+ br label %while.body
+
+while.body: ; preds = %while.body, %entry
+ %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+ %gep = getelementptr i32, ptr %ptr, i64 %index
+ store i32 %val, ptr %gep
+ %index.next = add nsw i64 %index, 1
+ %cmp10 = icmp ult i64 %index.next, %n
+ br i1 %cmp10, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 983aded012e80..81ed685c7fe59 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -321,37 +321,37 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; SCALABLE-NEXT: [[ENTRY:.*]]:
; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; SCALABLE: [[VECTOR_PH]]:
; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; SCALABLE-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP6]], splat (i64 1)
-; SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0
+; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; SCALABLE-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 1)
+; SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP7]]
; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
-; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
-; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
+; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
; SCALABLE: [[VECTOR_BODY]]:
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], splat (i64 10)
-; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i64> poison)
-; SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], <vscale x 2 x i64> zeroinitializer
+; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 4 x i64> [[VEC_IND]], splat (i64 10)
+; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i64> poison)
+; SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> zeroinitializer
; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP13]], align 8
+; SCALABLE-NEXT: store <vscale x 4 x i64> [[PREDPHI]], ptr [[TMP13]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
@@ -433,36 +433,36 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; TF-SCALABLE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; TF-SCALABLE: [[VECTOR_PH]]:
; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 1)
-; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
+; TF-SCALABLE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; TF-SCALABLE-NEXT: [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], splat (i64 1)
+; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP4]]
-; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
-; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
+; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
; TF-SCALABLE: [[VECTOR_BODY]]:
; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 1025)
-; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], splat (i64 10)
-; TF-SCALABLE-NEXT: [[TMP11:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> zeroinitializer
-; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i64> poison)
-; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], <vscale x 2 x i64> zeroinitializer
+; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 1025)
+; TF-SCALABLE-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 4 x i64> [[VEC_IND]], splat (i64 10)
+; TF-SCALABLE-NEXT: [[TMP10:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> zeroinitializer
+; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i64> poison)
+; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> zeroinitializer
; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv4i64.p0(<vscale x 4 x i64> [[PREDPHI]], ptr [[TMP13]], i32 8, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; TF-SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
@@ -550,7 +550,7 @@ latch:
store i64 %phi, ptr %arrayidx
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1025
- br i1 %exitcond.not, label %for.end, label %for.body
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
for.end:
ret void
@@ -1544,3 +1544,7 @@ for.body:
for.end:
ret void
}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index d5b9f08b8300e..bcde5d357c0df 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -78,8 +78,13 @@ loop:
store i64 0, ptr %arrayidx13, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 3
- br i1 %exitcond.not, label %exit, label %loop
+ br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
exit:
ret void
}
+
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.vectorize.width", i32 2}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}