[llvm] Reland [VPlan] Consolidate logic for narrowToSingleScalars (PR #171426)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 9 03:59:26 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-risc-v
Author: Ramkumar Ramachandra (artagnon)
<details>
<summary>Changes</summary>
Changes: In the previous iteration of this patch (7b3ec5191a70), a transform-ordering problem caused the insert-point after the first non-PHI in legalizeAndOptimizeInductions to not work as expected. Fix this by also calling narrowToSingleScalarRecipes before legalizeAndOptimizeInductions; as a result, this patch makes functional changes.
The logic for narrowing to single scalar recipes is in two different places: narrowToSingleScalarRecipes and legalizeAndOptimizeInductions. Consolidate them.
---
Patch is 36.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/171426.diff
12 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+6-27)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll (+6-6)
- (modified) llvm/test/Transforms/LoopVectorize/PowerPC/vplan-scalarivsext-crash.ll (+27-15)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll (+3-3)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll (+4-4)
- (modified) llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll (+24-6)
- (modified) llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll (+12-18)
- (modified) llvm/test/Transforms/LoopVectorize/X86/pr36524.ll (+4-7)
- (modified) llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll (+6-5)
- (modified) llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll (+6-4)
- (added) llvm/test/Transforms/LoopVectorize/pointer-induction-legalize-multiple.ll (+96)
- (modified) llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll (+6-6)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 2242f95e4226c..27bebcf23d4e5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -757,31 +757,6 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
if (!PhiR)
continue;
- // Try to narrow wide and replicating recipes to uniform recipes, based on
- // VPlan analysis.
- // TODO: Apply to all recipes in the future, to replace legacy uniformity
- // analysis.
- auto Users = collectUsersRecursively(PhiR);
- for (VPUser *U : reverse(Users)) {
- auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
- auto *RepR = dyn_cast<VPReplicateRecipe>(U);
- // Skip recipes that shouldn't be narrowed.
- if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
- Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
- (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
- continue;
-
- // Skip recipes that may have other lanes than their first used.
- if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
- continue;
-
- auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
- Def->operands(), /*IsUniform*/ true,
- /*Mask*/ nullptr, /*Flags*/ *Def);
- Clone->insertAfter(Def);
- Def->replaceAllUsesWith(Clone);
- }
-
// Replace wide pointer inductions which have only their scalars used by
// PtrAdd(IndStart, ScalarIVSteps (0, Step)).
if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
@@ -1541,8 +1516,11 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
continue;
}
- // Skip recipes that aren't single scalars.
- if (!vputils::isSingleScalar(RepOrWidenR))
+ // Skip recipes that aren't single scalars and don't just have their first
+ // lane used.
+ if (!vputils::isSingleScalar(RepOrWidenR) &&
+ (!vputils::onlyFirstLaneUsed(RepOrWidenR) ||
+ RepOrWidenR->getNumUsers() == 0))
continue;
// Skip recipes for which conversion to single-scalar does introduce
@@ -2535,6 +2513,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
runPass(simplifyRecipes, Plan);
runPass(removeDeadRecipes, Plan);
runPass(simplifyBlends, Plan);
+ runPass(narrowToSingleScalarRecipes, Plan);
runPass(legalizeAndOptimizeInductions, Plan);
runPass(narrowToSingleScalarRecipes, Plan);
runPass(removeRedundantExpandSCEVRecipes, Plan);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index f2c0ca30a6c18..8df88d60eeec8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -67,10 +67,10 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; VSCALEFORTUNING2-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP11:%.*]] = or <vscale x 4 x i32> [[TMP9]], [[TMP10]]
; VSCALEFORTUNING2-NEXT: [[TMP16:%.*]] = or i32 [[Z]], [[X]]
-; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP16]], i64 0
+; VSCALEFORTUNING2-NEXT: [[TMP13:%.*]] = and i32 [[TMP16]], 1
+; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP13]], i64 0
; VSCALEFORTUNING2-NEXT: [[TMP12:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; VSCALEFORTUNING2-NEXT: [[TMP13:%.*]] = and <vscale x 4 x i32> [[TMP12]], splat (i32 1)
-; VSCALEFORTUNING2-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i32> [[TMP13]], splat (i32 1)
+; VSCALEFORTUNING2-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i32> [[TMP12]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i32> [[TMP14]] to <vscale x 4 x i64>
; VSCALEFORTUNING2-NEXT: [[DOTSPLAT:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP15]]
; VSCALEFORTUNING2-NEXT: [[TMP18:%.*]] = call i32 @llvm.vscale.i32()
@@ -194,10 +194,10 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; PRED-NEXT: [[TMP14:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
; PRED-NEXT: [[TMP15:%.*]] = or <vscale x 4 x i32> [[TMP13]], [[TMP14]]
; PRED-NEXT: [[TMP20:%.*]] = or i32 [[Z]], [[X]]
-; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP20]], i64 0
+; PRED-NEXT: [[TMP17:%.*]] = and i32 [[TMP20]], 1
+; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP17]], i64 0
; PRED-NEXT: [[TMP16:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; PRED-NEXT: [[TMP17:%.*]] = and <vscale x 4 x i32> [[TMP16]], splat (i32 1)
-; PRED-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i32> [[TMP17]], splat (i32 1)
+; PRED-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i32> [[TMP16]], splat (i32 1)
; PRED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i32> [[TMP18]] to <vscale x 4 x i64>
; PRED-NEXT: [[DOTSPLAT:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP19]]
; PRED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-scalarivsext-crash.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-scalarivsext-crash.ll
index 34b8deaa8de03..a7067659a0d81 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-scalarivsext-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-scalarivsext-crash.ll
@@ -15,7 +15,7 @@ define void @test_iv_trunc_crash(ptr %a, ptr %b, i32 %n) {
; CHECK-NEXT: [[SMAX1:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0)
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[SMAX1]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = add nuw i32 [[TMP2]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP3]], 8
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP3]], 20
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
; CHECK: [[VECTOR_SCEVCHECK]]:
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[N]] to i64
@@ -27,34 +27,46 @@ define void @test_iv_trunc_crash(ptr %a, ptr %b, i32 %n) {
; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
; CHECK-NEXT: br i1 [[TMP9]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP3]], 8
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP3]], 16
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 8, i32 [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 16, i32 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP3]], [[TMP11]]
-; CHECK-NEXT: [[DOTCAST:%.*]] = sitofp i32 [[N_VEC]] to double
-; CHECK-NEXT: [[TMP12:%.*]] = fmul reassoc double [[X]], [[DOTCAST]]
-; CHECK-NEXT: [[TMP13:%.*]] = fadd reassoc double [[SUM_0]], [[TMP12]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[X]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[DOTCAST2:%.*]] = sitofp i32 [[N_VEC]] to double
+; CHECK-NEXT: [[TMP14:%.*]] = fmul reassoc double [[X]], [[DOTCAST2]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = fadd reassoc double [[SUM_0]], [[TMP14]]
+; CHECK-NEXT: [[TMP18:%.*]] = fmul reassoc <2 x double> splat (double 2.000000e+00), [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x double> poison, double [[SUM_0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT2]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = fmul reassoc <2 x double> <double 0.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = fadd reassoc <2 x double> [[BROADCAST_SPLAT3]], [[TMP15]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[DOTCAST2:%.*]] = sitofp i32 [[INDEX]] to double
-; CHECK-NEXT: [[TMP14:%.*]] = fmul reassoc double [[X]], [[DOTCAST2]]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = fadd reassoc double [[SUM_0]], [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = fmul reassoc double 7.000000e+00, [[X]]
-; CHECK-NEXT: [[TMP16:%.*]] = fadd reassoc double [[OFFSET_IDX]], [[TMP15]]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x double> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = fadd reassoc <2 x double> [[VEC_IND]], [[TMP18]]
+; CHECK-NEXT: [[STEP_ADD_2:%.*]] = fadd reassoc <2 x double> [[STEP_ADD]], [[TMP18]]
+; CHECK-NEXT: [[STEP_ADD_3:%.*]] = fadd reassoc <2 x double> [[STEP_ADD_2]], [[TMP18]]
+; CHECK-NEXT: [[STEP_ADD_4:%.*]] = fadd reassoc <2 x double> [[STEP_ADD_3]], [[TMP18]]
+; CHECK-NEXT: [[STEP_ADD_5:%.*]] = fadd reassoc <2 x double> [[STEP_ADD_4]], [[TMP18]]
+; CHECK-NEXT: [[STEP_ADD_6:%.*]] = fadd reassoc <2 x double> [[STEP_ADD_5]], [[TMP18]]
+; CHECK-NEXT: [[STEP_ADD_7:%.*]] = fadd reassoc <2 x double> [[STEP_ADD_6]], [[TMP18]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[STEP_ADD_7]], i32 1
; CHECK-NEXT: store double [[TMP16]], ptr [[B]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-NEXT: [[VEC_IND_NEXT]] = fadd reassoc <2 x double> [[STEP_ADD_7]], [[TMP18]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi double [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[SUM_0]], %[[ENTRY]] ], [ [[SUM_0]], %[[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi double [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[SUM_0]], %[[ENTRY]] ], [ [[SUM_0]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[SUM_1:%.*]] = phi double [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], %[[LOOP_BODY:.*]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP_BODY]] ]
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP_BODY]] ]
; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[I]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[LOOP_BODY]]
; CHECK: [[LOOP_BODY]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index 0723f16677090..536ff1723dbe8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -10,10 +10,10 @@ define void @test(ptr %p, i64 %a, i8 %b) {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i8> poison, i8 [[B]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[A]], 48
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP0]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i64 [[TMP0]], 52
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP1]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = ashr <vscale x 2 x i64> [[BROADCAST_SPLAT2]], splat (i64 52)
-; CHECK-NEXT: [[TMP7:%.*]] = trunc <vscale x 2 x i64> [[TMP6]] to <vscale x 2 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = trunc <vscale x 2 x i64> [[BROADCAST_SPLAT2]] to <vscale x 2 x i32>
; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 2 x i8> [[BROADCAST_SPLAT]] to <vscale x 2 x i32>
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 55e7018c49eec..8cdc27215f0d9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -602,16 +602,16 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]]
; FIXEDLEN: [[VECTOR_BODY]]:
; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; FIXEDLEN-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 5
-; FIXEDLEN-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 6
-; FIXEDLEN-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 7
+; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; FIXEDLEN-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[STEP_ADD]], i32 3
; FIXEDLEN-NEXT: store i64 [[TMP4]], ptr [[B]], align 8
; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 4
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4)
; FIXEDLEN-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXEDLEN-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; FIXEDLEN: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
index eba5a89939f78..e6a2a7c2dbe3a 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
@@ -4,15 +4,27 @@
define i32 @main(i32 %arg, ptr nocapture readnone %arg1) #0 {
; CHECK-LABEL: define i32 @main(
; CHECK-SAME: i32 [[ARG:%.*]], ptr readnone captures(none) [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = alloca i8, align 1
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STOREMERGE_I_I:%.*]] = extractelement <8 x i8> [[VEC_IND]], i32 7
+; CHECK-NEXT: store i8 [[STOREMERGE_I_I]], ptr [[TMP0]], align 2
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i8> [[VEC_IND]], splat (i8 8)
+; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH:.*]]
+; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[STOREMERGE_I_I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[TMP12_I_I:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: store i8 [[STOREMERGE_I_I]], ptr [[TMP0]], align 2
-; CHECK-NEXT: [[TMP8_I_I:%.*]] = icmp ult i8 [[STOREMERGE_I_I]], 8
-; CHECK-NEXT: [[TMP12_I_I]] = add nuw nsw i8 [[STOREMERGE_I_I]], 1
-; CHECK-NEXT: br i1 [[TMP8_I_I]], label %[[LOOP]], label %[[RET:.*]]
+; CHECK-NEXT: [[STOREMERGE_I_I1:%.*]] = phi i8 [ 8, %[[SCALAR_PH]] ], [ [[TMP12_I_I:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: store i8 [[STOREMERGE_I_I1]], ptr [[TMP0]], align 2
+; CHECK-NEXT: [[TMP8_I_I:%.*]] = icmp ult i8 [[STOREMERGE_I_I1]], 8
+; CHECK-NEXT: [[TMP12_I_I]] = add nuw nsw i8 [[STOREMERGE_I_I1]], 1
+; CHECK-NEXT: br i1 [[TMP8_I_I]], label %[[LOOP]], label %[[RET:.*]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[RET]]:
; CHECK-NEXT: ret i32 0
;
@@ -33,3 +45,9 @@ ret:
attributes #0 = { "target-cpu"="z13" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
index 651e2ad5e74da..cc9ea076b1609 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
@@ -865,12 +865,9 @@ define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) {
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[CMP_I]], <32 x i32> zeroinitializer, <32 x i32> poison
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <32 x i32> [[PREDPHI]], i32 0
; CHECK-NEXT: [[TMP36:%.*]] = ashr i32 [[CONV5_I]], [[TMP35]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT63:%.*]] = insertelement <32 x i32> poison, i32 [[TMP36]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT64:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT63]], <32 x i32> poison, <32 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP37:%.*]] = icmp eq <32 x i32> [[BROADCAST_SPLAT64]], zeroinitializer
+; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i32 [[TMP36]], 0
; CHECK-NEXT: [[TMP38:%.*]] = shl <32 x i32> [[PREDPHI]], splat (i32 24)
; CHECK-NEXT: [[TMP39:%.*]] = ashr exact <32 x i32> [[TMP38]], splat (i32 24)
-; CHECK-NEXT: [[TMP40:%.*]] = extractelement <32 x i1> [[TMP37]], i32 0
; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], <32 x i32> [[TMP39]], <32 x i32> zeroinitializer
; CHECK-NEXT: [[PREDPHI65:%.*]] = select <32 x i1> [[TMP34]], <32 x i32> [[TMP41]], <32 x i32> zeroinitializer
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
@@ -891,35 +888,32 @@ define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) {
; CHECK-NEXT: [[INDEX68:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT81:%.*]], [[PRED_UREM_CONTINUE76:%.*]] ]
; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP44]], i32 0
; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_UREM_IF69:%.*]], label [[PRED_UREM_CONTINUE70:%.*]]
-; CHECK: pred.urem.if69:
+; CHECK: pred.urem.if67:
; CHECK-NEXT: br label [[PRED_UREM_CONTINUE70]]
+; CHECK: pred.urem.continue68:
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i1> [[TMP44]], i32 1
+; CHECK-NEXT: br i1 [[TMP48]], label [[PRED_UREM_IF70:%.*]], label [[PRED_UREM_CONTINUE71:%.*]]
+; CHECK: pred.urem.if69:
+; CHECK-NEXT: br label [[PRED_URE...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/171426
More information about the llvm-commits mailing list