[llvm] [VPlan] Don't apply predication discount to non-originally-predicated blocks (PR #160449)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 23 23:06:49 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-vectorizers
Author: Luke Lau (lukel97)
<details>
<summary>Changes</summary>
Split off from #<!-- -->158690. Currently if an instruction needs predicated due to tail folding, it will also have a predicated discount applied to it in multiple places.
This is likely inaccurate because we can expect a tail folded instruction to be executed on every iteration bar the last.
This fixes it by checking if the instruction/block was originally predicated, and in doing so prevents vectorization with tail folding where we would have had to scalarize the memory op anyway.
---
Patch is 56.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/160449.diff
9 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+29-4)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.cpp (+3-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanHelpers.h (+4-15)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll (+12-57)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll (+15-232)
- (modified) llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll (+6-178)
- (modified) llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll (+6-6)
- (modified) llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll (+2-64)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ca092dcfcb492..c0c2063ca81b8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1249,6 +1249,25 @@ class LoopVectorizationCostModel {
/// Superset of instructions that return true for isScalarWithPredication.
bool isPredicatedInst(Instruction *I) const;
+ /// A helper function that returns how much we should divide the cost of a
+ /// predicated block by. Typically this is the reciprocal of the block
+ /// probability, i.e. if we return X we are assuming the predicated block will
+ /// execute once for every X iterations of the loop header so the block should
+ /// only contribute 1/X of its cost to the total cost calculation, but when
+ /// optimizing for code size it will just be 1 as code size costs don't depend
+ /// on execution probabilities.
+ ///
+ /// TODO: We should use actual block probability here, if available.
+ /// Currently, we always assume predicated blocks have a 50% chance of
+ /// executing.
+ inline unsigned
+ getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
+ BasicBlock *BB) const {
+ if (!Legal->blockNeedsPredication(BB))
+ return 1;
+ return CostKind == TTI::TCK_CodeSize ? 1 : 2;
+ }
+
/// Return the costs for our two available strategies for lowering a
/// div/rem operation which requires speculating at least one lane.
/// First result is for scalarization (will be invalid for scalable
@@ -2902,7 +2921,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
// likely.
- ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
+ ScalarizationCost =
+ ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
}
InstructionCost SafeDivisorCost = 0;
@@ -5035,7 +5055,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
}
// Scale the total scalar cost by block probability.
- ScalarCost /= getPredBlockCostDivisor(CostKind);
+ ScalarCost /= getPredBlockCostDivisor(CostKind, I->getParent());
// Compute the discount. A non-negative discount means the vector version
// of the instruction costs more, and scalarizing would be beneficial.
@@ -5088,7 +5108,7 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
// cost by the probability of executing it. blockNeedsPredication from
// Legal is used so as to not include all blocks in tail folded loops.
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
- BlockCost /= getPredBlockCostDivisor(CostKind);
+ BlockCost /= getPredBlockCostDivisor(CostKind, BB);
Cost += BlockCost;
}
@@ -5167,7 +5187,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// conditional branches, but may not be executed for each vector lane. Scale
// the cost by the probability of executing the predicated block.
if (isPredicatedInst(I)) {
- Cost /= getPredBlockCostDivisor(CostKind);
+ Cost /= getPredBlockCostDivisor(CostKind, I->getParent());
// Add the cost of an i1 extract and a branch
auto *VecI1Ty =
@@ -6727,6 +6747,11 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
SkipCostComputation.contains(UI);
}
+unsigned VPCostContext::getPredBlockCostDivisor(
+ TargetTransformInfo::TargetCostKind CostKind, BasicBlock *BB) const {
+ return CM.getPredBlockCostDivisor(CostKind, BB);
+}
+
InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
VPCostContext &CostCtx) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index a1c6f7977885f..e3b0c2bff9d02 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -855,7 +855,9 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
// For the scalar case, we may not always execute the original predicated
// block, Thus, scale the block's cost by the probability of executing it.
if (VF.isScalar())
- return ThenCost / getPredBlockCostDivisor(Ctx.CostKind);
+ if (auto *VPIRBB = dyn_cast<VPIRBasicBlock>(Then))
+ return ThenCost / Ctx.getPredBlockCostDivisor(Ctx.CostKind,
+ VPIRBB->getIRBasicBlock());
return ThenCost;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index fe59774b7c838..fe082851ca00c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -50,21 +50,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
int64_t Step);
-/// A helper function that returns how much we should divide the cost of a
-/// predicated block by. Typically this is the reciprocal of the block
-/// probability, i.e. if we return X we are assuming the predicated block will
-/// execute once for every X iterations of the loop header so the block should
-/// only contribute 1/X of its cost to the total cost calculation, but when
-/// optimizing for code size it will just be 1 as code size costs don't depend
-/// on execution probabilities.
-///
-/// TODO: We should use actual block probability here, if available. Currently,
-/// we always assume predicated blocks have a 50% chance of executing.
-inline unsigned
-getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) {
- return CostKind == TTI::TCK_CodeSize ? 1 : 2;
-}
-
/// A range of powers-of-2 vectorization factors with fixed start and
/// adjustable end. The range includes start and excludes end, e.g.,:
/// [1, 16) = {1, 2, 4, 8}
@@ -364,6 +349,10 @@ struct VPCostContext {
/// has already been pre-computed.
bool skipCostComputation(Instruction *UI, bool IsVector) const;
+ /// \returns how much the cost of a predicated block should be divided by.
+ unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
+ BasicBlock *BB) const;
+
/// Returns the OperandInfo for \p V, if it is a live-in.
TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index aa3de3613b68e..2e77b75b16e47 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3170,7 +3170,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
// likely.
- ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind);
+ ScalarCost /= Ctx.getPredBlockCostDivisor(Ctx.CostKind, UI->getParent());
return ScalarCost;
}
case Instruction::Load:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index e4ee6776ae24c..790e1d20b6ec1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -612,63 +612,18 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
;
; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store(
; COMMON-SAME: ptr [[DST:%.*]]) {
-; COMMON-NEXT: [[ENTRY:.*:]]
-; COMMON-NEXT: br label %[[VECTOR_PH:.*]]
-; COMMON: [[VECTOR_PH]]:
-; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
-; COMMON: [[VECTOR_BODY]]:
-; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; COMMON: [[PRED_STORE_IF]]:
-; COMMON-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0
-; COMMON-NEXT: store i8 0, ptr [[TMP0]], align 1
-; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE]]
-; COMMON: [[PRED_STORE_CONTINUE]]:
-; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
-; COMMON: [[PRED_STORE_IF1]]:
-; COMMON-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1
-; COMMON-NEXT: store i8 1, ptr [[TMP1]], align 1
-; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE2]]
-; COMMON: [[PRED_STORE_CONTINUE2]]:
-; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
-; COMMON: [[PRED_STORE_IF3]]:
-; COMMON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 2
-; COMMON-NEXT: store i8 2, ptr [[TMP2]], align 1
-; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE4]]
-; COMMON: [[PRED_STORE_CONTINUE4]]:
-; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
-; COMMON: [[PRED_STORE_IF5]]:
-; COMMON-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 3
-; COMMON-NEXT: store i8 3, ptr [[TMP3]], align 1
-; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE6]]
-; COMMON: [[PRED_STORE_CONTINUE6]]:
-; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
-; COMMON: [[PRED_STORE_IF7]]:
-; COMMON-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 4
-; COMMON-NEXT: store i8 4, ptr [[TMP4]], align 1
-; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE8]]
-; COMMON: [[PRED_STORE_CONTINUE8]]:
-; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
-; COMMON: [[PRED_STORE_IF9]]:
-; COMMON-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 5
-; COMMON-NEXT: store i8 5, ptr [[TMP5]], align 1
-; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE10]]
-; COMMON: [[PRED_STORE_CONTINUE10]]:
-; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
-; COMMON: [[PRED_STORE_IF11]]:
-; COMMON-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 6
-; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
-; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
-; COMMON: [[PRED_STORE_CONTINUE12]]:
-; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
-; COMMON: [[PRED_STORE_IF13]]:
-; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
-; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
-; COMMON-NEXT: br label %[[EXIT]]
-; COMMON: [[EXIT]]:
-; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
-; COMMON: [[SCALAR_PH]]:
-; COMMON-NEXT: br [[EXIT1:label %.*]]
-; COMMON: [[SCALAR_PH1:.*:]]
+; COMMON-NEXT: [[ENTRY:.*]]:
+; COMMON-NEXT: br label %[[EXIT1:.*]]
+; COMMON: [[EXIT1]]:
+; COMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[EXIT1]] ]
+; COMMON-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8
+; COMMON-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; COMMON-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1
+; COMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; COMMON-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7
+; COMMON-NEXT: br i1 [[EC]], label %[[SCALAR_PH1:.*]], label %[[EXIT1]]
+; COMMON: [[SCALAR_PH1]]:
+; COMMON-NEXT: ret void
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index cc7b4aecc3642..71c2a05af964f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -274,69 +274,11 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
;
; PRED-LABEL: define void @iv_trunc(
; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; PRED-NEXT: [[ENTRY:.*:]]
+; PRED-NEXT: [[ENTRY:.*]]:
; PRED-NEXT: [[MUL_X:%.*]] = add i32 [[X]], 1
-; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]]
-; PRED: [[VECTOR_SCEVCHECK]]:
-; PRED-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]]
-; PRED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[MUL_X]], 0
-; PRED-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[MUL_X]]
-; PRED-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32
-; PRED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]])
-; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; PRED-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]]
-; PRED-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0
-; PRED-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false
-; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
-; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295
-; PRED-NEXT: [[TMP10:%.*]] = icmp ne i32 [[MUL_X]], 0
-; PRED-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]]
-; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
-; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; PRED: [[VECTOR_PH]]:
-; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 2
-; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2
-; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
-; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL_X]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
-; PRED: [[VECTOR_BODY]]:
-; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ]
-; PRED-NEXT: [[TMP16:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; PRED-NEXT: [[TMP17:%.*]] = zext <2 x i32> [[TMP16]] to <2 x i64>
-; PRED-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; PRED-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; PRED: [[PRED_STORE_IF]]:
-; PRED-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP17]], i32 0
-; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]]
-; PRED-NEXT: store i32 1, ptr [[TMP20]], align 4
-; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]]
-; PRED: [[PRED_STORE_CONTINUE]]:
-; PRED-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; PRED-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]]
-; PRED: [[PRED_STORE_IF1]]:
-; PRED-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP17]], i32 1
-; PRED-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP22]]
-; PRED-NEXT: store i32 1, ptr [[TMP23]], align 4
-; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]]
-; PRED: [[PRED_STORE_CONTINUE2]]:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
-; PRED-NEXT: [[TMP25:%.*]] = xor i1 [[TMP24]], true
-; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
-; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; PRED: [[MIDDLE_BLOCK]]:
-; PRED-NEXT: br label %[[EXIT:.*]]
-; PRED: [[SCALAR_PH]]:
; PRED-NEXT: br label %[[FOR_BODY:.*]]
; PRED: [[FOR_BODY]]:
-; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; PRED-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32
; PRED-NEXT: [[ADD_I:%.*]] = mul i32 [[MUL_X]], [[TRUNC_IV]]
; PRED-NEXT: [[IV_MUL:%.*]] = zext i32 [[ADD_I]] to i64
@@ -344,7 +286,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: store i32 1, ptr [[GEP]], align 4
; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PRED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; PRED: [[EXIT]]:
; PRED-NEXT: ret void
;
@@ -440,101 +382,21 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
;
; PRED-LABEL: define void @trunc_ivs_and_store(
; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; PRED-NEXT: [[ENTRY:.*:]]
-; PRED-NEXT: [[MUL:%.*]] = mul i32 [[X]], [[X]]
-; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]]
-; PRED: [[VECTOR_SCEVCHECK]]:
+; PRED-NEXT: [[ENTRY:.*]]:
; PRED-NEXT: [[TMP1:%.*]] = mul i32 [[X]], [[X]]
-; PRED-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP1]]
-; PRED-NEXT: [[TMP3:%.*]] = icmp slt i32 [[MUL]], 0
-; PRED-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 [[MUL]]
-; PRED-NEXT: [[TMP5:%.*]] = trunc i64 [[N]] to i32
-; PRED-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]])
-; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
-; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
-; PRED-NEXT: [[TMP6:%.*]] = sub i32 0, [[MUL_RESULT]]
-; PRED-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], 0
-; PRED-NEXT: [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false
-; PRED-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
-; PRED-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], 4294967295
-; PRED-NEXT: [[TMP11:%.*]] = icmp ne i32 [[MUL]], 0
-; PRED-NEXT: [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
-; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
-; PRED-NEXT: br i1 [[TMP13]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; PRED: [[VECTOR_PH]]:
-; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4
-; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4
-; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
-; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
-; PRED: [[VECTOR_BODY]]:
-; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ]
-; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; PRED-NEXT: [[TMP17:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; PRED-NEXT: [[TMP18:%.*]] = zext <4 x i32> [[TMP17]] to <4 x i64>
-; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; PRED-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; PRED: [[PRED_STORE_IF]]:
-; PRED-NEXT: [...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/160449
More information about the llvm-commits
mailing list