[llvm] 28a4dea - [LV] Fix incorrectly marking a pointer indvar as 'scalar'.
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 28 01:50:21 PST 2021
Author: Sander de Smalen
Date: 2021-11-28T09:49:28Z
New Revision: 28a4deab921d2d9a1e27d520fc3f745eb8d9aa02
URL: https://github.com/llvm/llvm-project/commit/28a4deab921d2d9a1e27d520fc3f745eb8d9aa02
DIFF: https://github.com/llvm/llvm-project/commit/28a4deab921d2d9a1e27d520fc3f745eb8d9aa02.diff
LOG: [LV] Fix incorrectly marking a pointer indvar as 'scalar'.
collectLoopScalars should only add non-uniform nodes to the list if they
are used by a load/store instruction that is marked as CM_Scalarize.
Before this patch, the LV incorrectly marked pointer induction variables
as 'scalar' when they needed to be widened by something else,
such as a compare instruction, and weren't used by a node marked as
'CM_Scalarize'. This case is covered by sve-widen-phi.ll.
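For illustration, here is a minimal hand-written IR sketch (hypothetical;
modeled on the new test added to sve-widen-phi.ll) of the problematic
pattern: a pointer induction variable that feeds both a memory access and a
compare, where the compare forces the pointer PHI to be widened:

  define void @ptr_iv_used_by_compare(i16* %ptr) {
  entry:
    br label %for.body

  for.body:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
    %iv.ptr = phi i16* [ %ptr, %entry ], [ %iv.ptr.next, %for.body ]
    ; The compare of the pointer IV is widened to a vector compare, so
    ; the PHI cannot be treated as scalar-after-vectorization.
    %cmp = icmp ne i16* %iv.ptr, null
    %val = select i1 %cmp, i16 0, i16 1
    store i16 %val, i16* %iv.ptr, align 2
    %iv.ptr.next = getelementptr inbounds i16, i16* %iv.ptr, i64 1
    %iv.next = add nuw nsw i64 %iv, 1
    %exitcond = icmp ult i64 %iv.next, 1024
    br i1 %exitcond, label %for.body, label %for.end

  for.end:
    ret void
  }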
This change also allows removing some code where the LV tried to
widen the PHI nodes with a stepvector even though they were marked
'scalarAfterVectorization'. Now that collectLoopScalars is more careful
about marking instructions that need widening as 'scalar', that widening
code has become redundant.
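As a rough sketch (hand-written, not actual vectorizer output; value names
are illustrative), the widened pointer IV now takes the pointer-PHI plus
vector-GEP shape visible in the updated CHECK lines below, instead of being
rebuilt per part from a splat of the scalar index:

  define void @widened_ptr_iv_shape(i8* %start, i64 %n.vec) {
  vector.ph:
    br label %vector.body

  vector.body:
    %pointer.phi = phi i8* [ %start, %vector.ph ], [ %ptr.ind, %vector.body ]
    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    ; One vector of lane pointers per iteration, derived from the scalar PHI.
    %step = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
    %vector.gep = getelementptr i8, i8* %pointer.phi, <vscale x 2 x i64> %step
    ; Advance the scalar PHI by vscale x 2 elements per iteration.
    %vscale = call i64 @llvm.vscale.i64()
    %vf.scaled = shl i64 %vscale, 1
    %ptr.ind = getelementptr i8, i8* %pointer.phi, i64 %vf.scaled
    %index.next = add nuw i64 %index, %vf.scaled
    %done = icmp eq i64 %index.next, %n.vec
    br i1 %done, label %exit, label %vector.body

  exit:
    ret void
  }

  declare <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
  declare i64 @llvm.vscale.i64()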
Differential Revision: https://reviews.llvm.org/D114373
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
llvm/test/Transforms/LoopVectorize/pointer-induction.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f6f06b5f3aa0c..cf4bc3ce216cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1915,9 +1915,11 @@ class LoopVectorizationCostModel {
/// Collect the instructions that are scalar after vectorization. An
/// instruction is scalar if it is known to be uniform or will be scalarized
- /// during vectorization. Non-uniform scalarized instructions will be
- /// represented by VF values in the vectorized loop, each corresponding to an
- /// iteration of the original scalar loop.
+ /// during vectorization. collectLoopScalars should only add non-uniform nodes
+ /// to the list if they are used by a load/store instruction that is marked as
+ /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
+ /// VF values in the vectorized loop, each corresponding to an iteration of
+ /// the original scalar loop.
void collectLoopScalars(ElementCount VF);
/// Keeps cost model vectorization decision and cost for instructions.
@@ -4862,38 +4864,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
- unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
-
- bool NeedsVectorIndex = !IsUniform && VF.isScalable();
- Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
- if (NeedsVectorIndex) {
- Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
- UnitStepVec = Builder.CreateStepVector(VecIVTy);
- PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
- }
+ assert((IsUniform || !State.VF.isScalable()) &&
+ "Cannot scalarize a scalable VF");
+ unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PartStart =
createStepForVF(Builder, PtrInd->getType(), VF, Part);
- if (NeedsVectorIndex) {
- // Here we cache the whole vector, which means we can support the
- // extraction of any lane. However, in some cases the extractelement
- // instruction that is generated for scalar uses of this vector (e.g.
- // a load instruction) is not folded away. Therefore we still
- // calculate values for the first n lanes to avoid redundant moves
- // (when extracting the 0th element) and to produce scalar code (i.e.
- // additional add/gep instructions instead of expensive extractelement
- // instructions) when extracting higher-order elements.
- Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
- Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
- Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
- Value *SclrGep =
- emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
- SclrGep->setName("next.gep");
- State.set(PhiR, SclrGep, Part);
- }
-
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
Value *Idx = Builder.CreateAdd(
PartStart, ConstantInt::get(PtrInd->getType(), Lane));
@@ -5229,38 +5207,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
!TheLoop->isLoopInvariant(V);
};
- auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
- if (!isa<PHINode>(Ptr) ||
- !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
- return false;
- auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
- if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
- return false;
- return isScalarUse(MemAccess, Ptr);
- };
-
- // A helper that evaluates a memory access's use of a pointer. If the
- // pointer is actually the pointer induction of a loop, it is being
- // inserted into Worklist. If the use will be a scalar use, and the
- // pointer is only used by memory accesses, we place the pointer in
- // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
+ // A helper that evaluates a memory access's use of a pointer. If the use will
+ // be a scalar use and the pointer is only used by memory accesses, we place
+ // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
+ // PossibleNonScalarPtrs.
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
- if (isScalarPtrInduction(MemAccess, Ptr)) {
- Worklist.insert(cast<Instruction>(Ptr));
- LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
- << "\n");
-
- Instruction *Update = cast<Instruction>(
- cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
-
- // If there is more than one user of Update (Ptr), we shouldn't assume it
- // will be scalar after vectorisation as other users of the instruction
- // may require widening. Otherwise, add it to ScalarPtrs.
- if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) {
- ScalarPtrs.insert(Update);
- return;
- }
- }
// We only care about bitcast and getelementptr instructions contained in
// the loop.
if (!isLoopVaryingBitCastOrGEP(Ptr))
@@ -5352,11 +5303,22 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
continue;
+ // Returns true if \p Indvar is a pointer induction that is used directly by
+ // load/store instruction \p I.
+ auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
+ Instruction *I) {
+ return Induction.second.getKind() ==
+ InductionDescriptor::IK_PtrInduction &&
+ (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
+ };
+
// Determine if all users of the induction variable are scalar after
// vectorization.
auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
- return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
+ IsDirectLoadStoreFromPtrIndvar(Ind, I);
});
if (!ScalarInd)
continue;
@@ -5366,7 +5328,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
auto ScalarIndUpdate =
llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+ IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
});
if (!ScalarIndUpdate)
continue;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index 450af8c488199..a3d572cec28bc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -90,7 +90,7 @@ for.end:
;
; Same as predicate_store except we use a pointer PHI to maintain the address
;
-; CHECK: Found new scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
+; CHECK: Found scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
; CHECK: Found scalar instruction: %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %addr, align 4
; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
index 3d340daf3c2cd..7235e3a2f89fb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -41,35 +41,39 @@ define void @pointer_induction_used_as_vector(i8** noalias %start.1, i8* noalias
; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, i8* [[START_2:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** [[START_1]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP7]], 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP10]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP7]]
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START_2]], <vscale x 2 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP4]], i64 1
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to <vscale x 2 x i8*>*
-; CHECK-NEXT: store <vscale x 2 x i8*> [[TMP11]], <vscale x 2 x i8*>* [[TMP13]], align 8
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* [[NEXT_GEP5]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <vscale x 2 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP15]], align 1
-; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP14]] to <vscale x 2 x i8>*
-; CHECK-NEXT: store <vscale x 2 x i8> [[TMP16]], <vscale x 2 x i8>* [[TMP17]], align 1
-; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP11]]
+; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul <vscale x 2 x i64> [[TMP12]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <vscale x 2 x i64> [[VECTOR_GEP]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[TMP13]], i64 1
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to <vscale x 2 x i8*>*
+; CHECK-NEXT: store <vscale x 2 x i8*> [[TMP14]], <vscale x 2 x i8*>* [[TMP16]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 2 x i8*> [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <vscale x 2 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP19]], align 1
+; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP18]] to <vscale x 2 x i8>*
+; CHECK-NEXT: store <vscale x 2 x i8> [[TMP20]], <vscale x 2 x i8>* [[TMP21]], align 1
+; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP9]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -119,32 +123,66 @@ exit: ; preds = %loop.body
define void @pointer_induction(i8* noalias %start, i64 %N) {
; CHECK-LABEL: @pointer_induction(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i8*> poison, i8* [[START:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[START:%.*]], i64 [[N_VEC]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i8*> poison, i8* [[START]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i8*> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i8*> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP6]], 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP6]]
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[START]], <vscale x 2 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* [[START]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 1
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[NEXT_GEP3]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <vscale x 2 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP12]], align 1
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP]], i64 1
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <vscale x 2 x i8*> [[TMP13]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul <vscale x 2 x i64> [[TMP11]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <vscale x 2 x i64> [[VECTOR_GEP]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x i8*> [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, i8* [[TMP14]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <vscale x 2 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP16]], align 1
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[TMP12]], i64 1
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq <vscale x 2 x i8*> [[TMP17]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP20]]
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP8]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[PTR_PHI:%.*]] = phi i8* [ [[PTR_PHI_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INDEX_NXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[PTR_PHI]], align 1
+; CHECK-NEXT: [[PTR_PHI_NEXT]] = getelementptr inbounds i8, i8* [[PTR_PHI]], i64 1
+; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i8* [[PTR_PHI_NEXT]], [[START]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDEX]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: end:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 69c3753d6ec8f..c7cba533fcea2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -125,25 +125,26 @@ for.cond.cleanup: ; preds = %for.body
define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) #0 {
; CHECK-LABEL: @pointer_iv_mixed(
; CHECK: vector.body:
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ %a, %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ insertelement (<vscale x 2 x i32> zeroinitializer, i32 0, i32 0), %vector.ph ], [ [[TMP7:%.*]], %vector.body ]
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX]], i32 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP4]]
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> [[TMP5]]
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i32, i32* %a, i64 [[INDEX]]
-; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32*, i32** %b, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[NEXT_GEP4]] to <vscale x 2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, <vscale x 2 x i32>* [[TMP6]], align 8
-; CHECK-NEXT: [[TMP7]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32** [[NEXT_GEP6]] to <vscale x 2 x i32*>*
-; CHECK-NEXT: store <vscale x 2 x i32*> [[NEXT_GEP]], <vscale x 2 x i32*>* [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}}
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ insertelement (<vscale x 2 x i32> zeroinitializer, i32 0, i32 0), %vector.ph ], [ [[TMP9:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <vscale x 2 x i64> [[TMP6]]
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32*, i32** %b, i64 [[INDEX]]
+; CHECK-NEXT: [[BC:%.*]] = bitcast <vscale x 2 x i32*> [[TMP7]] to <vscale x 2 x <vscale x 2 x i32>*>
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x <vscale x 2 x i32>*> [[BC]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, <vscale x 2 x i32>* [[TMP8]], align 8
+; CHECK-NEXT: [[TMP9]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32** [[NEXT_GEP]] to <vscale x 2 x i32*>*
+; CHECK-NEXT: store <vscale x 2 x i32*> [[TMP7]], <vscale x 2 x i32*>* [[TMP10]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}}
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP5]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP7:![0-9]+]]
entry:
br label %for.body
@@ -166,7 +167,51 @@ for.end:
ret i32 %tmp5
}
+define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(i16* %ptr) #0 {
+; CHECK-LABEL: @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(
+; CHECK: vector.body:
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i16* [ %ptr, %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, i16* [[POINTER_PHI]], <vscale x 2 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <vscale x 2 x i16*> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[BC:%.*]] = bitcast <vscale x 2 x i16*> [[TMP5]] to <vscale x 2 x <vscale x 2 x i16>*>
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <vscale x 2 x <vscale x 2 x i16>*> [[BC]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i16.p0nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16>* [[TMP7]], i32 2, <vscale x 2 x i1> [[TMP6]])
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}}
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i16, i16* [[POINTER_PHI]], i64 [[TMP3]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP9:![0-9]+]]
+entry:
+ br label %for.body
+
+for.body: ; preds = %if.end, %entry
+ %iv = phi i64 [ %inc, %if.end ], [ 0, %entry ]
+ %iv.ptr = phi i16* [ %incdec.iv.ptr, %if.end ], [ %ptr, %entry ]
+ %cmp.i = icmp ne i16* %iv.ptr, null
+ br i1 %cmp.i, label %if.end.sink.split, label %if.end
+
+if.end.sink.split: ; preds = %for.body
+ store i16 0, i16* %iv.ptr, align 2
+ br label %if.end
+
+if.end: ; preds = %if.end.sink.split, %for.body
+ %incdec.iv.ptr = getelementptr inbounds i16, i16* %iv.ptr, i64 1
+ %inc = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp ult i64 %inc, 1024
+ br i1 %exitcond.not, label %for.body, label %for.end, !llvm.loop !6
+
+for.end: ; preds = %if.end, %for.end
+ %iv.ptr.1.lcssa = phi i16* [ %incdec.iv.ptr, %if.end ]
+ ret void
+}
+
attributes #0 = { vscale_range(0, 16) }
+
!0 = distinct !{!0, !1, !2, !3, !4, !5}
!1 = !{!"llvm.loop.mustprogress"}
!2 = !{!"llvm.loop.vectorize.width", i32 4}
diff --git a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
index 0a127ad4ef88b..a16f73d16d8da 100644
--- a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
@@ -399,23 +399,13 @@ for.end:
; CHECK-NOT: LV: Found uniform instruction: %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
; CHECK: LV: Found uniform instruction: %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
; CHECK: vector.body
+; CHECK: %pointer.phi = phi i32* [ %a, %vector.ph ], [ %ptr.ind, %vector.body ]
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK: %next.gep = getelementptr i32, i32* %a, i64 %index
-; CHECK: %[[I1:.+]] = or i64 %index, 1
-; CHECK: %next.gep10 = getelementptr i32, i32* %a, i64 %[[I1]]
-; CHECK: %[[I2:.+]] = or i64 %index, 2
-; CHECK: %next.gep11 = getelementptr i32, i32* %a, i64 %[[I2]]
-; CHECK: %[[I3:.+]] = or i64 %index, 3
-; CHECK: %next.gep12 = getelementptr i32, i32* %a, i64 %[[I3]]
-; CHECK: %[[V0:.+]] = insertelement <4 x i32*> poison, i32* %next.gep, i32 0
-; CHECK: %[[V1:.+]] = insertelement <4 x i32*> %[[V0]], i32* %next.gep10, i32 1
-; CHECK: %[[V2:.+]] = insertelement <4 x i32*> %[[V1]], i32* %next.gep11, i32 2
-; CHECK: %[[V3:.+]] = insertelement <4 x i32*> %[[V2]], i32* %next.gep12, i32 3
-; CHECK-NOT: getelementptr
-; CHECK: %next.gep13 = getelementptr i32*, i32** %b, i64 %index
-; CHECK-NOT: getelementptr
-; CHECK: %[[B0:.+]] = bitcast i32** %next.gep13 to <4 x i32*>*
-; CHECK: store <4 x i32*> %[[V3]], <4 x i32*>* %[[B0]], align 8
+; CHECK: %[[PTRVEC:.+]] = getelementptr i32, i32* %pointer.phi, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK: %next.gep = getelementptr i32*, i32** %b, i64 %index
+; CHECK: %[[NEXTGEPBC:.+]] = bitcast i32** %next.gep to <4 x i32*>*
+; CHECK: store <4 x i32*> %[[PTRVEC]], <4 x i32*>* %[[NEXTGEPBC]], align 8
+; CHECK: %ptr.ind = getelementptr i32, i32* %pointer.phi, i64 4
; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
;
define i32 @pointer_iv_mixed(i32* %a, i32** %b, i64 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index d77ce32e753ee..39b131e0290c9 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -21,13 +21,13 @@ define void @a(i8* readnone %b) {
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* null, i64 [[TMP1]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ null, [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ]
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i64> <i64 0, i64 -1, i64 -2, i64 -3>
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP2]], i64 -1
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP5]], i32 -3
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], -1
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* null, i64 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i64 -1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 -3
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -36,35 +36,43 @@ define void @a(i8* readnone %b) {
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0
; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i64 -1
; CHECK-NEXT: store i8 95, i8* [[TMP11]], align 1
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP9]], i32 1
-; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
-; CHECK: pred.store.if2:
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 1
-; CHECK-NEXT: store i8 95, i8* [[TMP13]], align 1
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE3]]
-; CHECK: pred.store.continue3:
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP9]], i32 2
-; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
-; CHECK: pred.store.if4:
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 2
+; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK: pred.store.if5:
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], -1
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, i8* null, i64 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP2]], i64 -1
; CHECK-NEXT: store i8 95, i8* [[TMP15]], align 1
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE5]]
-; CHECK: pred.store.continue5:
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP9]], i32 3
-; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]]
-; CHECK: pred.store.if6:
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 3
-; CHECK-NEXT: store i8 95, i8* [[TMP17]], align 1
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]]
-; CHECK: pred.store.continue7:
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; CHECK: pred.store.continue6:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP9]], i32 2
+; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; CHECK: pred.store.if7:
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], -1
+; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* null, i64 [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP3]], i64 -1
+; CHECK-NEXT: store i8 95, i8* [[TMP19]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; CHECK: pred.store.continue8:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP9]], i32 3
+; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]]
+; CHECK: pred.store.if9:
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], -1
+; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* null, i64 [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP4]], i64 -1
+; CHECK-NEXT: store i8 95, i8* [[TMP23]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; CHECK: pred.store.continue10:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 -4
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -78,8 +86,8 @@ define void @a(i8* readnone %b) {
; CHECK: for.body:
; CHECK-NEXT: [[C_05:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[C_05]], i64 -1
-; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP19]], 0
+; CHECK-NEXT: [[TMP25:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP25]], 0
; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
; CHECK: if.then:
; CHECK-NEXT: store i8 95, i8* [[INCDEC_PTR]], align 1
@@ -134,35 +142,27 @@ define void @pointer_induction_used_as_vector(i8** noalias %start.1, i8* noalias
; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, i8* [[START_2:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** [[START_1]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i8*> poison, i8* [[NEXT_GEP4]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i8*> [[TMP6]], i8* [[NEXT_GEP5]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8*> [[TMP7]], i8* [[NEXT_GEP6]], i32 2
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i8*> [[TMP8]], i8* [[NEXT_GEP7]], i32 3
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP9]], i64 1
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to <4 x i8*>*
-; CHECK-NEXT: store <4 x i8*> [[TMP10]], <4 x i8*>* [[TMP12]], align 8
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[NEXT_GEP4]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP14]], align 1
-; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP13]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[TMP15]], <4 x i8>* [[TMP16]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to <4 x i8*>*
+; CHECK-NEXT: store <4 x i8*> [[TMP3]], <4 x i8*>* [[TMP5]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8*> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
+; CHECK-NEXT: store <4 x i8> [[TMP9]], <4 x i8>* [[TMP10]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 4
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]