[llvm] [VPlan] Simplify before convertToDeadRecipes, remove SCALAR-STEPS. (PR #123655)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 20 09:49:25 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-backend-powerpc
Author: Florian Hahn (fhahn)
<details>
<summary>Changes</summary>
After unrolling, there may be additional simplifications that can be applied. One example is removing SCALAR-STEPS for the first part where only the first lane is demanded.
This removes redundant adds of 0 from a large number of tests (~200), many of which I am still working on updating.
In preparation for removing redundant WideIV steps added in https://github.com/llvm/llvm-project/pull/119284.
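For illustration, here is a minimal before/after sketch of the effect on the generated IR, lifted from the test updates below (names such as `%index`, `%A`, `%tmp6`, `%tmp7` come from those tests):

```llvm
; Before: the scalar step for part 0 is materialized as a redundant add of 0.
%tmp6 = add i64 %index, 0
%tmp7 = getelementptr inbounds i32, ptr %A, i64 %tmp6

; After: the step is folded away and the canonical IV is used directly.
%tmp7 = getelementptr inbounds i32, ptr %A, i64 %index
```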
---
Patch is 69.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123655.diff
34 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+2)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+3)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+9-2)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+2)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll (+2-3)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll (+2-4)
- (modified) llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll (+3-4)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll (+8-16)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll (+4-8)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/ordered-reduction.ll (+2-4)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll (+6-8)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll (+2-4)
- (modified) llvm/test/Transforms/LoopVectorize/extract-from-end-vector-constant.ll (+2-4)
- (modified) llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll (+9-12)
- (modified) llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/opaque-ptr.ll (+4-8)
- (modified) llvm/test/Transforms/LoopVectorize/pointer-induction.ll (+2-4)
- (modified) llvm/test/Transforms/LoopVectorize/pr35773.ll (+1-3)
- (modified) llvm/test/Transforms/LoopVectorize/pr37248.ll (+2-4)
- (modified) llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll (+2-4)
- (modified) llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/select-with-fastflags.ll (+3-3)
- (modified) llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll (+2-3)
- (modified) llvm/test/Transforms/LoopVectorize/uniform-blend.ll (+2-4)
- (modified) llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll (+57-14)
- (modified) llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll (+4-6)
- (modified) llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll (+1-2)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 29f3940ed6fa7a..7566833720cf69 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7652,6 +7652,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::unrollByUF(BestVPlan, BestUF,
OrigLoop->getHeader()->getContext());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+ VPlanTransforms::simplifyRecipes(BestVPlan, Legal->getWidestInductionType());
+ VPlanTransforms::removeDeadRecipes(BestVPlan);
VPlanTransforms::convertToConcreteRecipes(BestVPlan);
// Perform the actual loop transformation.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index db45ad8aadbbe3..87887747d93141 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4026,6 +4026,9 @@ class VPlan {
UFs.insert(UF);
}
+ /// Returns true if the VPlan already has been unrolled, i.e. it has UF = 1.
+ unsigned isUnrolled() const { return UFs.size() == 1 && UFs.back() == 1; }
+
/// Return a string with the name of the plan and the applicable VFs and UFs.
std::string getName() const;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9febd612c644e1..b66c68215547b5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -888,6 +888,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
+ if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&R)) {
+ if (Steps->getParent()->getPlan()->isUnrolled() &&
+ Steps->getNumOperands() == 2 && vputils::onlyFirstLaneUsed(Steps)) {
+ Steps->replaceAllUsesWith(Steps->getOperand(0));
+ return;
+ }
+ }
+
VPValue *A;
if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
VPValue *Trunc = R.getVPSingleValue();
@@ -964,7 +972,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
/// Try to simplify the recipes in \p Plan. Use \p CanonicalIVTy as type for all
/// un-typed live-ins in VPTypeAnalysis.
-static void simplifyRecipes(VPlan &Plan, Type *CanonicalIVTy) {
+void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type *CanonicalIVTy) {
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
Plan.getEntry());
VPTypeAnalysis TypeInfo(CanonicalIVTy);
@@ -1041,7 +1049,6 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
}
Term->eraseFromParent();
- VPlanTransforms::removeDeadRecipes(Plan);
Plan.setVF(BestVF);
Plan.setUF(BestUF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index a751b8b5e8dc59..9226cf04d47491 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -138,6 +138,8 @@ struct VPlanTransforms {
/// Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToConcreteRecipes(VPlan &Plan);
+ static void simplifyRecipes(VPlan &Plan, Type *CanonicalIVTy);
+
/// If there's a single exit block, optimize its phi recipes that use exiting
/// IV values by feeding them precomputed end values instead, possibly taken
/// one step backwards.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
index fdf6122742b2dc..31f299b4d8cfbf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
@@ -22,8 +22,7 @@ define void @f1(ptr %A) #0 {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
; CHECK-NEXT: store <vscale x 4 x i32> splat (i32 1), ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 3d4f7e0e4924bc..32b735d60385f6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -487,8 +487,7 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; PRED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
-; PRED-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP13]]
+; PRED-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[INDEX]]
; PRED-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[TMP14]], i32 0
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP15]], i32 2, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> poison)
; PRED-NEXT: [[TMP19:%.*]] = udiv <vscale x 4 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index a83c62b04afc77..acdc32a57750d7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -39,12 +39,11 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[IDX]], [[INDEX]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[A:%.*]], <vscale x 2 x i32> [[VEC_IND]]
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 2 x ptr> [[TMP15]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP16]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP17]], align 8
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[TMP18]], i32 0
; CHECK-NEXT: store <vscale x 2 x double> [[WIDE_LOAD]], ptr [[TMP19]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP7]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
index cb33dfebddc237..8bfb298892ceab 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -293,11 +293,10 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) {
; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY1]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD3]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
index 08da1dedae23e5..4136a9f9e79388 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -20,8 +20,7 @@ define void @inv_store_i16(ptr noalias %dst, ptr noalias readonly %src, i64 %N)
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[SRC:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[SRC:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 2
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
@@ -85,8 +84,7 @@ define void @cond_inv_store_i32(ptr noalias %dst, ptr noalias readonly %src, i64
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
index b3157b75bc26fd..a515b10bb7d62a 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
@@ -15,13 +15,12 @@ define dso_local void @test(ptr %Arr, i32 signext %Len) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP7]], align 4
@@ -51,7 +50,7 @@ define dso_local void @test(ptr %Arr, i32 signext %Len) {
; CHECK: for.inc:
; CHECK-NEXT: [[INC]] = add nsw i32 [[I_02]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[LEN]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 14818199072c28..64128fe4d0b882 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -29,8 +29,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; OUTLOOP: vector.body:
; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; OUTLOOP-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0
-; OUTLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP6]]
+; OUTLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
; OUTLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 2
; OUTLOOP-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
@@ -83,8 +82,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; INLOOP: vector.body:
; INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; INLOOP-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0
-; INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP6]]
+; INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
; INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP8]], align 2
; INLOOP-NEXT: [[TMP9:%.*]] = sext <vscale x 8 x i16> [[WIDE_LOAD]] to <vscale x 8 x i32>
@@ -139,8 +137,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
-; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = add i32 [[EVL_BASED_IV]], 0
-; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP6]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[EVL_BASED_IV]]
; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.vp.load.nxv4i16.p0(ptr align 2 [[TMP8]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i16(<vscale x 4 x i16> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
@@ -196,8 +193,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP5]], i32 8, i1 true)
-; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = add i32 [[EVL_BASED_IV]], 0
-; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
+; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i16(<vscale x 8 x i16> [[VP_OP_LOAD]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
@@ -270,8 +266,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; OUTLOOP: vector.body:
; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; OUTLOOP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; OUTLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; OUTLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
; OUTLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; OUTLOOP-NEXT: [[TMP9:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
@@ -318,8 +313,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; INLOOP: vector.body:
; INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
-; INLOOP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
; INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; INLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
@@ -373,8 +367,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0
-; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_PHI]]
@@ -429,8 +422,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0
-; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/123655