[llvm] [LV] Optimise users of induction variables in early exit blocks (PR #130766)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 11 05:54:07 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: David Sherwood (david-arm)
This is the second of two PRs that attempt to improve the IR
generated in the exit blocks of vectorised loops with uncountable
early exits. It follows on from PR #128880. In this PR I am
improving the generated code for users of induction variables in
early exit blocks.
This required adding a new VPInstruction called FirstActiveLane,
which calculates the index of the first active lane of the mask
(predicate) operand.
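For reference, when the VPlan executes, FirstActiveLane lowers to the
experimental cttz.elts intrinsic via CreateCountTrailingZeroElems, the
same call already emitted for ExtractFirstActive. A minimal sketch for
a fixed-width <4 x i1> mask (%mask is a placeholder name):

```llvm
; Counts the trailing inactive lanes of the mask, i.e. the index of the
; first active lane. The trailing i1 true flag means the result is
; poison when no lane is active, which cannot happen here because the
; early-exit path is only taken when some lane triggered the exit.
%first.active.lane = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %mask, i1 true)
```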
I have also added a new function, optimizeEarlyExitInductionUser,
which is called from optimizeInductionExitUsers when handling users
in early exit blocks.
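The effect is visible in the test updates below: instead of keeping a
widened IV vector alive solely to feed an extractelement in the early
exit block, the exit value is recomputed from the scalar canonical IV.
A rough before/after sketch, adapted from same_exit_block_pre_inc_use1
(the IV starts at 3; %index, %mask and %lane.idx are renamed for
readability):

```llvm
; Before: the wide IV phi %vec.ind survives the loop just for this extract.
%first.active.lane = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %mask, i1 true)
%early.exit.value = extractelement <4 x i64> %vec.ind, i64 %first.active.lane

; After: computed from the scalar canonical IV, so %vec.ind can be erased.
%first.active.lane = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %mask, i1 true)
%lane.idx = add i64 %index, %first.active.lane ; canonical IV + lane index
%early.exit.value = add i64 3, %lane.idx       ; plus the IV start value
```

If the exit user reads the post-incremented IV, the index is first
bumped by one, and for non-canonical inductions the computed index is
fed into a derived IV (start + index * step) rather than a plain add.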
---
Patch is 96.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130766.diff
6 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+2)
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+2)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+9)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+66-5)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll (+10-25)
- (modified) llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll (+583-120)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 43dc30c40bb53..979123d1ab0af 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -882,6 +882,8 @@ class VPInstruction : public VPRecipeWithIRFlags,
// Extracts the first active lane of a vector, where the first operand is
// the predicate, and the second operand is the vector to extract.
ExtractFirstActive,
+ // Calculates the first active lane index of the vector predicate operand.
+ FirstActiveLane,
};
private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 6f6875f0e5e0e..2aea5f6286668 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -85,6 +85,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return VecTy->getElementType();
return BaseTy;
}
+ case VPInstruction::FirstActiveLane:
+ return IntegerType::get(Ctx, 64);
case VPInstruction::LogicalAnd:
assert(inferScalarType(R->getOperand(0))->isIntegerTy(1) &&
inferScalarType(R->getOperand(1))->isIntegerTy(1) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a8b304271f0da..2f11ce7e7fe71 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -712,6 +712,11 @@ Value *VPInstruction::generate(VPTransformState &State) {
Builder.getInt64Ty(), Mask, true, "first.active.lane");
return Builder.CreateExtractElement(Vec, Ctz, "early.exit.value");
}
+ case VPInstruction::FirstActiveLane: {
+ Value *Mask = State.get(getOperand(0));
+ return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
+ true, "first.active.lane");
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -754,6 +759,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractFromEnd ||
getOpcode() == VPInstruction::ExtractFirstActive ||
+ getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
}
@@ -946,6 +952,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractFirstActive:
O << "extract-first-active";
break;
+ case VPInstruction::FirstActiveLane:
+ O << "first-active-lane";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3b44e95a2471c..ecb5b64461a64 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -737,6 +737,64 @@ static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
return IsWideIVInc() ? WideIV : nullptr;
}
+/// Attempts to optimize the induction variable exit values for users in the
+/// early exit block.
+static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
+ VPTypeAnalysis &TypeInfo,
+ VPBlockBase *PredVPBB,
+ VPValue *Op) {
+ using namespace VPlanPatternMatch;
+
+ VPValue *Incoming, *Mask;
+ if (!match(Op, m_VPInstruction<VPInstruction::ExtractFirstActive>(
+ m_VPValue(Incoming), m_VPValue(Mask))))
+ return nullptr;
+
+ auto *WideIV = getOptimizableIVOf(Incoming);
+ if (!WideIV)
+ return nullptr;
+
+ auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
+ if (WideIntOrFp && WideIntOrFp->getTruncInst())
+ return nullptr;
+
+ // Calculate the final index.
+ VPValue *EndValue = Plan.getCanonicalIV();
+ auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
+ VPBuilder B(cast<VPBasicBlock>(PredVPBB));
+
+ DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
+ VPValue *FirstActiveLane =
+ B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
+ if (CanonicalIVType != TypeInfo.inferScalarType(FirstActiveLane)) {
+ Instruction::CastOps CastOp = CanonicalIVType->getScalarSizeInBits() < 64
+ ? Instruction::Trunc
+ : Instruction::ZExt;
+ FirstActiveLane =
+ B.createScalarCast(CastOp, FirstActiveLane, CanonicalIVType, DL);
+ }
+ EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);
+
+ // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
+ // changed it means the exit is using the incremented value, so we need to
+ // add the step.
+ if (Incoming != WideIV) {
+ VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
+ EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
+ }
+
+ if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
+ const InductionDescriptor &ID = WideIV->getInductionDescriptor();
+ VPValue *Start = WideIV->getStartValue();
+ VPValue *Step = WideIV->getStepValue();
+ EndValue = B.createDerivedIV(
+ ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
+ Start, EndValue, Step);
+ }
+
+ return EndValue;
+}
+
/// Attempts to optimize the induction variable exit values for users in the
/// exit block coming from the latch in the original scalar loop.
static VPValue *
@@ -799,12 +857,15 @@ void VPlanTransforms::optimizeInductionExitUsers(
break;
for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
+ VPValue *Escape = nullptr;
if (PredVPBB == MiddleVPBB)
- if (VPValue *Escape = optimizeLatchExitInductionUser(
- Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx),
- EndValues))
- ExitIRI->setOperand(Idx, Escape);
- // TODO: Optimize early exit induction users in follow-on patch.
+ Escape = optimizeLatchExitInductionUser(
+ Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), EndValues);
+ else
+ Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
+ ExitIRI->getOperand(Idx));
+ if (Escape)
+ ExitIRI->setOperand(Idx, Escape);
}
}
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
index 5f926db1131f6..02a16234b86cb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -25,16 +25,9 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
; CHECK-NEXT: [[TMP6:%.*]] = add i64 3, [[N_VEC]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.stepvector.nxv16i64()
-; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 16 x i64> [[TMP7]], splat (i64 1)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 16 x i64> splat (i64 3), [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[TMP9]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP10]]
@@ -48,7 +41,6 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 16 x i1> [[TMP15]], splat (i1 true)
; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP16]])
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]]
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.split:
@@ -58,7 +50,8 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP16]], i1 true)
-; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <vscale x 16 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP20]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -140,7 +133,8 @@ define i64 @same_exit_block_pre_inc_use4() {
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 true)
-; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <2 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -197,7 +191,6 @@ define i64 @loop_contains_safe_call() #1 {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[TMP0]]
@@ -209,7 +202,6 @@ define i64 @loop_contains_safe_call() #1 {
; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.split:
@@ -218,7 +210,8 @@ define i64 @loop_contains_safe_call() #1 {
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
-; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -283,16 +276,9 @@ define i64 @loop_contains_safe_div() #1 {
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
-; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP17:%.*]] = mul <vscale x 4 x i64> [[TMP16]], splat (i64 1)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> splat (i64 3), [[TMP17]]
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = add i64 3, [[INDEX2]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX1]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP0]]
@@ -304,7 +290,6 @@ define i64 @loop_contains_safe_div() #1 {
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[TMP14]], splat (i1 true)
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[INDEX1]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.split:
@@ -314,7 +299,8 @@ define i64 @loop_contains_safe_div() #1 {
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP15]], i1 true)
-; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <vscale x 4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX2]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP16]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[OFFSET_IDX]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -375,7 +361,6 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP0]]
@@ -389,7 +374,6 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.split:
@@ -399,7 +383,8 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
-; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
index 9cb4b6f9d8680..82b908eab4871 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
@@ -15,7 +15,6 @@ define i64 @same_exit_block_pre_inc_use1() {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
@@ -29,7 +28,6 @@ define i64 @same_exit_block_pre_inc_use1() {
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true)
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.split:
@@ -38,7 +36,8 @@ define i64 @same_exit_block_pre_inc_use1() {
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
-; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -86,6 +85,427 @@ loop.end:
}
+define i32 @same_exit_block_pre_inc_use1_iv64_endi32_step2() {
+; CHECK-LABEL: define i32 @same_exit_block_pre_inc_use1_iv64_endi32_step2() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/130766