[llvm] 88aab08 - [LV] Check for hoisted safe-div selects in planContainsAdditionalSimp.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 23 13:54:44 PDT 2025
Author: Florian Hahn
Date: 2025-09-23T21:54:02+01:00
New Revision: 88aab08ae5682a21edef71b814e5ebc05e7a9450
URL: https://github.com/llvm/llvm-project/commit/88aab08ae5682a21edef71b814e5ebc05e7a9450
DIFF: https://github.com/llvm/llvm-project/commit/88aab08ae5682a21edef71b814e5ebc05e7a9450.diff
LOG: [LV] Check for hoisted safe-div selects in planContainsAdditionalSimp.
In some cases, safe-divisor selects can be hoisted out of the vector
loop. Catching all such cases in the legacy cost model isn't possible; in
particular, it cannot check whether all conditions guarding a division
are loop-invariant.
Instead, check in planContainsAdditionalSimplifications if there are any
hoisted safe-divisor selects. If so, don't compare to the more
inaccurate legacy cost model.
Fixes https://github.com/llvm/llvm-project/issues/160354.
Fixes https://github.com/llvm/llvm-project/issues/160356.
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ca092dcfcb492..30fcc9b7680ed 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2907,15 +2907,12 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
InstructionCost SafeDivisorCost = 0;
auto *VecTy = toVectorTy(I->getType(), VF);
- auto *DivisorI = dyn_cast<Instruction>(I->getOperand(1));
- if (DivisorI && !Legal->isInvariant(DivisorI)) {
- // The cost of the select guard to ensure all lanes are well defined
- // after we speculate above any internal control flow.
- SafeDivisorCost +=
- TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
- toVectorTy(Type::getInt1Ty(I->getContext()), VF),
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- }
+ // The cost of the select guard to ensure all lanes are well defined
+ // after we speculate above any internal control flow.
+ SafeDivisorCost +=
+ TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
+ toVectorTy(Type::getInt1Ty(I->getContext()), VF),
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
SmallVector<const Value *, 4> Operands(I->operand_values());
SafeDivisorCost += TTI.getArithmeticInstrCost(
@@ -6908,6 +6905,28 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
return nullptr;
};
+ // Check if a select for a safe divisor was hoisted to the pre-header. If so,
+ // the select doesn't need to be considered for the vector loop cost; go with
+ // the more accurate VPlan-based cost model.
+ for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
+ auto *VPI = dyn_cast<VPInstruction>(&R);
+ if (!VPI || VPI->getOpcode() != Instruction::Select ||
+ VPI->getNumUsers() != 1)
+ continue;
+
+ if (auto *WR = dyn_cast<VPWidenRecipe>(*VPI->user_begin())) {
+ switch (WR->getOpcode()) {
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ return true;
+ default:
+ break;
+ }
+ }
+ }
+
DenseSet<Instruction *> SeenInstrs;
auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
index efce4bdf712a0..1dcd665817196 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux"
; Test case from https://github.com/llvm/llvm-project/issues/148431.
define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8 %n, i64 %off) #0 {
; CHECK-LABEL: define void @test_predicated_load_cast_hint(
-; CHECK-SAME: ptr [[DST_1:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]], i8 [[N:%.*]], i64 [[OFF:%.*]]) {
+; CHECK-SAME: ptr [[DST_1:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]], i8 [[N:%.*]], i64 [[OFF:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[N_EXT:%.*]] = sext i8 [[N]] to i32
; CHECK-NEXT: [[N_SUB:%.*]] = add i32 [[N_EXT]], -15
@@ -66,205 +66,64 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
; CHECK-NEXT: br i1 [[CONFLICT_RDX15]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP2]], 15
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 16
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP2]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 [[TMP2]])
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE50:.*]] ]
-; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i8
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i8 [[DOTCAST]], 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT17]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT18]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP25:%.*]] = icmp ule <16 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <16 x i8> poison, i8 [[TMP26]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT19]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT20]] to <16 x i64>
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP25]], i32 0
-; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE22:.*]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 4, i8 8, i8 12>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
+; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP28]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i64>
+; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[VEC_IND]] to <4 x i64>
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
-; CHECK-NEXT: [[TMP29:%.*]] = add i8 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP30:%.*]] = zext i8 [[TMP29]] to i64
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP30]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i64> [[TMP27]], i32 0
-; CHECK-NEXT: [[TMP33:%.*]] = or i64 [[TMP32]], 1
-; CHECK-NEXT: store i64 [[TMP33]], ptr [[TMP31]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
-; CHECK: [[PRED_STORE_CONTINUE]]:
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP25]], i32 1
-; CHECK-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
-; CHECK: [[PRED_STORE_IF21]]:
-; CHECK-NEXT: [[TMP35:%.*]] = add i8 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP36:%.*]] = zext i8 [[TMP35]] to i64
-; CHECK-NEXT: [[TMP37:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP36]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i64> [[TMP27]], i32 1
-; CHECK-NEXT: [[TMP39:%.*]] = or i64 [[TMP38]], 1
-; CHECK-NEXT: store i64 [[TMP39]], ptr [[TMP37]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]]
-; CHECK: [[PRED_STORE_CONTINUE22]]:
-; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP25]], i32 2
-; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
-; CHECK: [[PRED_STORE_IF23]]:
-; CHECK-NEXT: [[TMP41:%.*]] = add i8 [[OFFSET_IDX]], 8
-; CHECK-NEXT: [[TMP42:%.*]] = zext i8 [[TMP41]] to i64
-; CHECK-NEXT: [[TMP43:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP42]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP44:%.*]] = extractelement <16 x i64> [[TMP27]], i32 2
-; CHECK-NEXT: [[TMP45:%.*]] = or i64 [[TMP44]], 1
-; CHECK-NEXT: store i64 [[TMP45]], ptr [[TMP43]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]]
-; CHECK: [[PRED_STORE_CONTINUE24]]:
-; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i1> [[TMP25]], i32 3
-; CHECK-NEXT: br i1 [[TMP46]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
-; CHECK: [[PRED_STORE_IF25]]:
-; CHECK-NEXT: [[TMP47:%.*]] = add i8 [[OFFSET_IDX]], 12
-; CHECK-NEXT: [[TMP48:%.*]] = zext i8 [[TMP47]] to i64
-; CHECK-NEXT: [[TMP49:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP48]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i64> [[TMP27]], i32 3
-; CHECK-NEXT: [[TMP51:%.*]] = or i64 [[TMP50]], 1
-; CHECK-NEXT: store i64 [[TMP51]], ptr [[TMP49]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]]
-; CHECK: [[PRED_STORE_CONTINUE26]]:
-; CHECK-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP25]], i32 4
-; CHECK-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
-; CHECK: [[PRED_STORE_IF27]]:
-; CHECK-NEXT: [[TMP53:%.*]] = add i8 [[OFFSET_IDX]], 16
-; CHECK-NEXT: [[TMP54:%.*]] = zext i8 [[TMP53]] to i64
-; CHECK-NEXT: [[TMP55:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP54]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP56:%.*]] = extractelement <16 x i64> [[TMP27]], i32 4
-; CHECK-NEXT: [[TMP57:%.*]] = or i64 [[TMP56]], 1
-; CHECK-NEXT: store i64 [[TMP57]], ptr [[TMP55]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]]
-; CHECK: [[PRED_STORE_CONTINUE28]]:
-; CHECK-NEXT: [[TMP58:%.*]] = extractelement <16 x i1> [[TMP25]], i32 5
-; CHECK-NEXT: br i1 [[TMP58]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
-; CHECK: [[PRED_STORE_IF29]]:
-; CHECK-NEXT: [[TMP59:%.*]] = add i8 [[OFFSET_IDX]], 20
-; CHECK-NEXT: [[TMP60:%.*]] = zext i8 [[TMP59]] to i64
-; CHECK-NEXT: [[TMP61:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP60]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP62:%.*]] = extractelement <16 x i64> [[TMP27]], i32 5
-; CHECK-NEXT: [[TMP63:%.*]] = or i64 [[TMP62]], 1
-; CHECK-NEXT: store i64 [[TMP63]], ptr [[TMP61]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE30]]
-; CHECK: [[PRED_STORE_CONTINUE30]]:
-; CHECK-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP25]], i32 6
-; CHECK-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]]
-; CHECK: [[PRED_STORE_IF31]]:
-; CHECK-NEXT: [[TMP65:%.*]] = add i8 [[OFFSET_IDX]], 24
-; CHECK-NEXT: [[TMP66:%.*]] = zext i8 [[TMP65]] to i64
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP66]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP68:%.*]] = extractelement <16 x i64> [[TMP27]], i32 6
-; CHECK-NEXT: [[TMP69:%.*]] = or i64 [[TMP68]], 1
-; CHECK-NEXT: store i64 [[TMP69]], ptr [[TMP67]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE32]]
-; CHECK: [[PRED_STORE_CONTINUE32]]:
-; CHECK-NEXT: [[TMP70:%.*]] = extractelement <16 x i1> [[TMP25]], i32 7
-; CHECK-NEXT: br i1 [[TMP70]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]]
-; CHECK: [[PRED_STORE_IF33]]:
-; CHECK-NEXT: [[TMP71:%.*]] = add i8 [[OFFSET_IDX]], 28
-; CHECK-NEXT: [[TMP72:%.*]] = zext i8 [[TMP71]] to i64
-; CHECK-NEXT: [[TMP73:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP72]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP74:%.*]] = extractelement <16 x i64> [[TMP27]], i32 7
-; CHECK-NEXT: [[TMP75:%.*]] = or i64 [[TMP74]], 1
-; CHECK-NEXT: store i64 [[TMP75]], ptr [[TMP73]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE34]]
-; CHECK: [[PRED_STORE_CONTINUE34]]:
-; CHECK-NEXT: [[TMP76:%.*]] = extractelement <16 x i1> [[TMP25]], i32 8
-; CHECK-NEXT: br i1 [[TMP76]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36:.*]]
-; CHECK: [[PRED_STORE_IF35]]:
-; CHECK-NEXT: [[TMP77:%.*]] = add i8 [[OFFSET_IDX]], 32
-; CHECK-NEXT: [[TMP78:%.*]] = zext i8 [[TMP77]] to i64
-; CHECK-NEXT: [[TMP79:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP78]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP80:%.*]] = extractelement <16 x i64> [[TMP27]], i32 8
-; CHECK-NEXT: [[TMP81:%.*]] = or i64 [[TMP80]], 1
-; CHECK-NEXT: store i64 [[TMP81]], ptr [[TMP79]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE36]]
-; CHECK: [[PRED_STORE_CONTINUE36]]:
-; CHECK-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP25]], i32 9
-; CHECK-NEXT: br i1 [[TMP82]], label %[[PRED_STORE_IF37:.*]], label %[[PRED_STORE_CONTINUE38:.*]]
-; CHECK: [[PRED_STORE_IF37]]:
-; CHECK-NEXT: [[TMP83:%.*]] = add i8 [[OFFSET_IDX]], 36
-; CHECK-NEXT: [[TMP84:%.*]] = zext i8 [[TMP83]] to i64
-; CHECK-NEXT: [[TMP85:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP84]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP86:%.*]] = extractelement <16 x i64> [[TMP27]], i32 9
-; CHECK-NEXT: [[TMP87:%.*]] = or i64 [[TMP86]], 1
-; CHECK-NEXT: store i64 [[TMP87]], ptr [[TMP85]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE38]]
-; CHECK: [[PRED_STORE_CONTINUE38]]:
-; CHECK-NEXT: [[TMP88:%.*]] = extractelement <16 x i1> [[TMP25]], i32 10
-; CHECK-NEXT: br i1 [[TMP88]], label %[[PRED_STORE_IF39:.*]], label %[[PRED_STORE_CONTINUE40:.*]]
-; CHECK: [[PRED_STORE_IF39]]:
-; CHECK-NEXT: [[TMP89:%.*]] = add i8 [[OFFSET_IDX]], 40
-; CHECK-NEXT: [[TMP90:%.*]] = zext i8 [[TMP89]] to i64
-; CHECK-NEXT: [[TMP91:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP90]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP92:%.*]] = extractelement <16 x i64> [[TMP27]], i32 10
-; CHECK-NEXT: [[TMP93:%.*]] = or i64 [[TMP92]], 1
-; CHECK-NEXT: store i64 [[TMP93]], ptr [[TMP91]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE40]]
-; CHECK: [[PRED_STORE_CONTINUE40]]:
-; CHECK-NEXT: [[TMP94:%.*]] = extractelement <16 x i1> [[TMP25]], i32 11
-; CHECK-NEXT: br i1 [[TMP94]], label %[[PRED_STORE_IF41:.*]], label %[[PRED_STORE_CONTINUE42:.*]]
-; CHECK: [[PRED_STORE_IF41]]:
-; CHECK-NEXT: [[TMP95:%.*]] = add i8 [[OFFSET_IDX]], 44
-; CHECK-NEXT: [[TMP96:%.*]] = zext i8 [[TMP95]] to i64
-; CHECK-NEXT: [[TMP97:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP96]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP98:%.*]] = extractelement <16 x i64> [[TMP27]], i32 11
-; CHECK-NEXT: [[TMP99:%.*]] = or i64 [[TMP98]], 1
-; CHECK-NEXT: store i64 [[TMP99]], ptr [[TMP97]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE42]]
-; CHECK: [[PRED_STORE_CONTINUE42]]:
-; CHECK-NEXT: [[TMP100:%.*]] = extractelement <16 x i1> [[TMP25]], i32 12
-; CHECK-NEXT: br i1 [[TMP100]], label %[[PRED_STORE_IF43:.*]], label %[[PRED_STORE_CONTINUE44:.*]]
-; CHECK: [[PRED_STORE_IF43]]:
-; CHECK-NEXT: [[TMP101:%.*]] = add i8 [[OFFSET_IDX]], 48
-; CHECK-NEXT: [[TMP102:%.*]] = zext i8 [[TMP101]] to i64
+; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i64> [[TMP26]], i32 0
; CHECK-NEXT: [[TMP103:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP102]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP104:%.*]] = extractelement <16 x i64> [[TMP27]], i32 12
+; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0
; CHECK-NEXT: [[TMP105:%.*]] = or i64 [[TMP104]], 1
; CHECK-NEXT: store i64 [[TMP105]], ptr [[TMP103]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE44]]
-; CHECK: [[PRED_STORE_CONTINUE44]]:
-; CHECK-NEXT: [[TMP106:%.*]] = extractelement <16 x i1> [[TMP25]], i32 13
-; CHECK-NEXT: br i1 [[TMP106]], label %[[PRED_STORE_IF45:.*]], label %[[PRED_STORE_CONTINUE46:.*]]
-; CHECK: [[PRED_STORE_IF45]]:
-; CHECK-NEXT: [[TMP107:%.*]] = add i8 [[OFFSET_IDX]], 52
-; CHECK-NEXT: [[TMP108:%.*]] = zext i8 [[TMP107]] to i64
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; CHECK: [[PRED_STORE_IF17]]:
+; CHECK-NEXT: [[TMP108:%.*]] = extractelement <4 x i64> [[TMP26]], i32 1
; CHECK-NEXT: [[TMP109:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP108]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP110:%.*]] = extractelement <16 x i64> [[TMP27]], i32 13
+; CHECK-NEXT: [[TMP110:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1
; CHECK-NEXT: [[TMP111:%.*]] = or i64 [[TMP110]], 1
; CHECK-NEXT: store i64 [[TMP111]], ptr [[TMP109]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE46]]
-; CHECK: [[PRED_STORE_CONTINUE46]]:
-; CHECK-NEXT: [[TMP112:%.*]] = extractelement <16 x i1> [[TMP25]], i32 14
-; CHECK-NEXT: br i1 [[TMP112]], label %[[PRED_STORE_IF47:.*]], label %[[PRED_STORE_CONTINUE48:.*]]
-; CHECK: [[PRED_STORE_IF47]]:
-; CHECK-NEXT: [[TMP113:%.*]] = add i8 [[OFFSET_IDX]], 56
-; CHECK-NEXT: [[TMP114:%.*]] = zext i8 [[TMP113]] to i64
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]]
+; CHECK: [[PRED_STORE_CONTINUE18]]:
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
+; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; CHECK: [[PRED_STORE_IF19]]:
+; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i64> [[TMP26]], i32 2
; CHECK-NEXT: [[TMP115:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP114]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP116:%.*]] = extractelement <16 x i64> [[TMP27]], i32 14
+; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2
; CHECK-NEXT: [[TMP117:%.*]] = or i64 [[TMP116]], 1
; CHECK-NEXT: store i64 [[TMP117]], ptr [[TMP115]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE48]]
-; CHECK: [[PRED_STORE_CONTINUE48]]:
-; CHECK-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP25]], i32 15
-; CHECK-NEXT: br i1 [[TMP118]], label %[[PRED_STORE_IF49:.*]], label %[[PRED_STORE_CONTINUE50]]
-; CHECK: [[PRED_STORE_IF49]]:
-; CHECK-NEXT: [[TMP119:%.*]] = add i8 [[OFFSET_IDX]], 60
-; CHECK-NEXT: [[TMP120:%.*]] = zext i8 [[TMP119]] to i64
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE20]]
+; CHECK: [[PRED_STORE_CONTINUE20]]:
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
+; CHECK-NEXT: br i1 [[TMP42]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22]]
+; CHECK: [[PRED_STORE_IF21]]:
+; CHECK-NEXT: [[TMP120:%.*]] = extractelement <4 x i64> [[TMP26]], i32 3
; CHECK-NEXT: [[TMP121:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP120]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP122:%.*]] = extractelement <16 x i64> [[TMP27]], i32 15
+; CHECK-NEXT: [[TMP122:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3
; CHECK-NEXT: [[TMP123:%.*]] = or i64 [[TMP122]], 1
; CHECK-NEXT: store i64 [[TMP123]], ptr [[TMP121]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE50]]
-; CHECK: [[PRED_STORE_CONTINUE50]]:
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]]
+; CHECK: [[PRED_STORE_CONTINUE22]]:
; CHECK-NEXT: store i8 0, ptr [[DST_2]], align 1, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
-; CHECK-NEXT: [[TMP124:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP124]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX_NEXT]], i32 [[TMP2]])
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT: [[TMP48:%.*]] = xor i1 [[TMP47]], true
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 16)
+; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
@@ -309,6 +168,219 @@ exit:
ret void
}
+; Check computing costs for sdiv/udiv with invariant divisor and tail folding.
+; From https://github.com/llvm/llvm-project/issues/160354.
+define void @srem_sdiv_with_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %end) #0 {
+; CHECK-LABEL: define void @srem_sdiv_with_tail_folding(
+; CHECK-SAME: i32 [[D_0:%.*]], i32 [[D_1:%.*]], ptr [[DST:%.*]], i32 [[END:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[IV_SUB:%.*]] = add nsw i32 [[IV]], -1
+; CHECK-NEXT: [[REM:%.*]] = srem i32 [[IV_SUB]], [[D_0]]
+; CHECK-NEXT: [[REM_1:%.*]] = add nsw i32 [[REM]], 1
+; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[REM_1]], [[D_0]]
+; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[IV_SUB]], [[D_1]]
+; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[DIV]], 1
+; CHECK-NEXT: [[ADD_1_EXT:%.*]] = sext i32 [[ADD_1]] to i64
+; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[ADD_1_EXT]]
+; CHECK-NEXT: store i32 [[IV]], ptr [[GEP_DST]], align 4
+; CHECK-NEXT: br label %[[LOOP_LATCH]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], [[END]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %iv.sub = add nsw i32 %iv, -1
+ %rem = srem i32 %iv.sub, %d.0
+ %rem.1 = add nsw i32 %rem, 1
+ %c = icmp eq i32 %rem.1, %d.0
+ br i1 %c, label %then, label %loop.latch
+
+then:
+ %div = sdiv i32 %iv.sub, %d.1
+ %add.1 = add i32 %div, 1
+ %add.1.ext = sext i32 %add.1 to i64
+ %gep.dst = getelementptr i32, ptr %dst, i64 %add.1.ext
+ store i32 %iv, ptr %gep.dst, align 4
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add nuw nsw i32 %iv, 1
+ %ec = icmp ne i32 %iv.next, %end
+ br i1 %ec, label %loop.header, label %exit
+
+exit:
+ ret void
+}
+
+; Check computing costs for predicated sdiv/udiv with invariant divisor without tail folding.
+; From https://github.com/llvm/llvm-project/issues/160356.
+define void @srem_sdiv_without_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %end) #1 {
+; CHECK-LABEL: define void @srem_sdiv_without_tail_folding(
+; CHECK-SAME: i32 [[D_0:%.*]], i32 [[D_1:%.*]], ptr [[DST:%.*]], i32 [[END:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[END]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[END]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[END]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[D_0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 -1)
+; CHECK-NEXT: [[TMP1:%.*]] = srem <4 x i32> [[TMP0]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], splat (i32 1)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
+; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]]
+; CHECK: [[PRED_SDIV_IF]]:
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = sdiv i32 [[TMP5]], [[D_1]]
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
+; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE]]
+; CHECK: [[PRED_SDIV_CONTINUE]]:
+; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_SDIV_IF]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
+; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_SDIV_IF1:.*]], label %[[PRED_SDIV_CONTINUE2:.*]]
+; CHECK: [[PRED_SDIV_IF1]]:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = sdiv i32 [[TMP10]], [[D_1]]
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1
+; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE2]]
+; CHECK: [[PRED_SDIV_CONTINUE2]]:
+; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], %[[PRED_SDIV_CONTINUE]] ], [ [[TMP12]], %[[PRED_SDIV_IF1]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
+; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_SDIV_IF3:.*]], label %[[PRED_SDIV_CONTINUE4:.*]]
+; CHECK: [[PRED_SDIV_IF3]]:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP16:%.*]] = sdiv i32 [[TMP15]], [[D_1]]
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2
+; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE4]]
+; CHECK: [[PRED_SDIV_CONTINUE4]]:
+; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], %[[PRED_SDIV_CONTINUE2]] ], [ [[TMP17]], %[[PRED_SDIV_IF3]] ]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
+; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_SDIV_IF5:.*]], label %[[PRED_SDIV_CONTINUE6:.*]]
+; CHECK: [[PRED_SDIV_IF5]]:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP21:%.*]] = sdiv i32 [[TMP20]], [[D_1]]
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3
+; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE6]]
+; CHECK: [[PRED_SDIV_CONTINUE6]]:
+; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], %[[PRED_SDIV_CONTINUE4]] ], [ [[TMP22]], %[[PRED_SDIV_IF5]] ]
+; CHECK-NEXT: [[TMP24:%.*]] = add <4 x i32> [[TMP23]], splat (i32 1)
+; CHECK-NEXT: [[TMP25:%.*]] = sext <4 x i32> [[TMP24]] to <4 x i64>
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
+; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
+; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]]
+; CHECK-NEXT: [[TMP33:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
+; CHECK-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; CHECK: [[PRED_STORE_IF9]]:
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP35]]
+; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: store i32 [[TMP37]], ptr [[TMP36]], align 4
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_CONTINUE10]]:
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
+; CHECK-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_IF11]]:
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP39]]
+; CHECK-NEXT: [[TMP41:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT: store i32 [[TMP41]], ptr [[TMP40]], align 4
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_CONTINUE12]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP42]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[END]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[IV_SUB:%.*]] = add nsw i32 [[IV]], -1
+; CHECK-NEXT: [[REM:%.*]] = srem i32 [[IV_SUB]], [[D_0]]
+; CHECK-NEXT: [[REM_1:%.*]] = add nsw i32 [[REM]], 1
+; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[REM_1]], [[D_0]]
+; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[IV_SUB]], [[D_1]]
+; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[DIV]], 1
+; CHECK-NEXT: [[ADD_1_EXT:%.*]] = sext i32 [[ADD_1]] to i64
+; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[ADD_1_EXT]]
+; CHECK-NEXT: store i32 [[IV]], ptr [[GEP_DST]], align 4
+; CHECK-NEXT: br label %[[LOOP_LATCH]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], [[END]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %iv.sub = add nsw i32 %iv, -1
+ %rem = srem i32 %iv.sub, %d.0
+ %rem.1 = add nsw i32 %rem, 1
+ %c = icmp eq i32 %rem.1, %d.0
+ br i1 %c, label %then, label %loop.latch
+
+then:
+ %div = sdiv i32 %iv.sub, %d.1
+ %add.1 = add i32 %div, 1
+ %add.1.ext = sext i32 %add.1 to i64
+ %gep.dst = getelementptr i32, ptr %dst, i64 %add.1.ext
+ store i32 %iv, ptr %gep.dst, align 4
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add nuw nsw i32 %iv, 1
+ %ec = icmp ne i32 %iv.next, %end
+ br i1 %ec, label %loop.header, label %exit
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-cpu"="neoverse-v1" }
+attributes #1 = { "target-cpu"="neoverse-v2" }
+
!0 = distinct !{!0, !1, !2, !3}
!1 = !{!"llvm.loop.mustprogress"}
!2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
@@ -327,4 +399,6 @@ exit:
; CHECK: [[META10]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META11]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META9]], [[META10]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META10]], [[META11]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META11]], [[META10]]}
;.
More information about the llvm-commits
mailing list