[llvm] 0aac227 - [LV] Correctly cost chains of replicating calls in legacy CM.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 31 07:14:07 PDT 2025
Author: Florian Hahn
Date: 2025-08-31T15:13:47+01:00
New Revision: 0aac22758a81a98d9612ed1ad4853d9e434e8451
URL: https://github.com/llvm/llvm-project/commit/0aac22758a81a98d9612ed1ad4853d9e434e8451
DIFF: https://github.com/llvm/llvm-project/commit/0aac22758a81a98d9612ed1ad4853d9e434e8451.diff
LOG: [LV] Correctly cost chains of replicating calls in legacy CM.
Check for scalarized calls in needsExtract to fix a divergence between
legacy and VPlan-based cost model.
The legacy cost model was missing a check for scalarized calls in
needsExtract, which meant it incorrectly assumed the result of a
scalarized call needs extracting.
Exposed by https://github.com/llvm/llvm-project/pull/154617.
Fixes https://github.com/llvm/llvm-project/issues/156091.
Added:
llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 56403dba92df2..1f4610ad191ca 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1134,7 +1134,10 @@ class LoopVectorizationCostModel {
CallWideningDecision getCallWideningDecision(CallInst *CI,
ElementCount VF) const {
assert(!VF.isScalar() && "Expected vector VF");
- return CallWideningDecisions.at({CI, VF});
+ auto I = CallWideningDecisions.find({CI, VF});
+ if (I == CallWideningDecisions.end())
+ return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
+ return I->second;
}
/// Return True if instruction \p I is an optimizable truncate whose operand
@@ -1657,7 +1660,9 @@ class LoopVectorizationCostModel {
Instruction *I = dyn_cast<Instruction>(V);
if (VF.isScalar() || !I || !TheLoop->contains(I) ||
TheLoop->isLoopInvariant(I) ||
- getWideningDecision(I, VF) == CM_Scalarize)
+ getWideningDecision(I, VF) == CM_Scalarize ||
+ (isa<CallInst>(I) &&
+ getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
return false;
// Assume we can vectorize V (and hence we need extraction) if the
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
new file mode 100644
index 0000000000000..67f0201476602
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
@@ -0,0 +1,599 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Test case for https://github.com/llvm/llvm-project/issues/156091.
+define void @test_replicate_call_chain(float %x, ptr noalias %A, ptr noalias %B, ptr align 4 noalias %C, ptr align 4 noalias %D, ptr noalias %E) #0 {
+; CHECK-LABEL: @test_replicate_call_chain(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <16 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fcmp ogt <16 x float> [[WIDE_LOAD1]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[C:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i1> [[TMP6]], splat (i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP9]], i32 4, <16 x i1> [[TMP7]], <16 x float> poison)
+; CHECK-NEXT: [[TMP10:%.*]] = fmul <16 x float> [[WIDE_MASKED_LOAD]], splat (float 2.000000e+00)
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = tail call float @llvm.pow.f32(float [[TMP11]], float [[X:%.*]])
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP10]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.pow.f32(float [[TMP13]], float [[X]])
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP10]], i32 2
+; CHECK-NEXT: [[TMP16:%.*]] = tail call float @llvm.pow.f32(float [[TMP15]], float [[X]])
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP10]], i32 3
+; CHECK-NEXT: [[TMP18:%.*]] = tail call float @llvm.pow.f32(float [[TMP17]], float [[X]])
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x float> [[TMP10]], i32 4
+; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.pow.f32(float [[TMP19]], float [[X]])
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x float> [[TMP10]], i32 5
+; CHECK-NEXT: [[TMP22:%.*]] = tail call float @llvm.pow.f32(float [[TMP21]], float [[X]])
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x float> [[TMP10]], i32 6
+; CHECK-NEXT: [[TMP24:%.*]] = tail call float @llvm.pow.f32(float [[TMP23]], float [[X]])
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x float> [[TMP10]], i32 7
+; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.pow.f32(float [[TMP25]], float [[X]])
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x float> [[TMP10]], i32 8
+; CHECK-NEXT: [[TMP28:%.*]] = tail call float @llvm.pow.f32(float [[TMP27]], float [[X]])
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x float> [[TMP10]], i32 9
+; CHECK-NEXT: [[TMP30:%.*]] = tail call float @llvm.pow.f32(float [[TMP29]], float [[X]])
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x float> [[TMP10]], i32 10
+; CHECK-NEXT: [[TMP32:%.*]] = tail call float @llvm.pow.f32(float [[TMP31]], float [[X]])
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x float> [[TMP10]], i32 11
+; CHECK-NEXT: [[TMP34:%.*]] = tail call float @llvm.pow.f32(float [[TMP33]], float [[X]])
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x float> [[TMP10]], i32 12
+; CHECK-NEXT: [[TMP36:%.*]] = tail call float @llvm.pow.f32(float [[TMP35]], float [[X]])
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x float> [[TMP10]], i32 13
+; CHECK-NEXT: [[TMP38:%.*]] = tail call float @llvm.pow.f32(float [[TMP37]], float [[X]])
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x float> [[TMP10]], i32 14
+; CHECK-NEXT: [[TMP40:%.*]] = tail call float @llvm.pow.f32(float [[TMP39]], float [[X]])
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x float> [[TMP10]], i32 15
+; CHECK-NEXT: [[TMP42:%.*]] = tail call float @llvm.pow.f32(float [[TMP41]], float [[X]])
+; CHECK-NEXT: [[TMP43:%.*]] = tail call float @llvm.pow.f32(float [[TMP12]], float [[X]])
+; CHECK-NEXT: [[TMP44:%.*]] = tail call float @llvm.pow.f32(float [[TMP14]], float [[X]])
+; CHECK-NEXT: [[TMP45:%.*]] = tail call float @llvm.pow.f32(float [[TMP16]], float [[X]])
+; CHECK-NEXT: [[TMP46:%.*]] = tail call float @llvm.pow.f32(float [[TMP18]], float [[X]])
+; CHECK-NEXT: [[TMP47:%.*]] = tail call float @llvm.pow.f32(float [[TMP20]], float [[X]])
+; CHECK-NEXT: [[TMP48:%.*]] = tail call float @llvm.pow.f32(float [[TMP22]], float [[X]])
+; CHECK-NEXT: [[TMP49:%.*]] = tail call float @llvm.pow.f32(float [[TMP24]], float [[X]])
+; CHECK-NEXT: [[TMP50:%.*]] = tail call float @llvm.pow.f32(float [[TMP26]], float [[X]])
+; CHECK-NEXT: [[TMP51:%.*]] = tail call float @llvm.pow.f32(float [[TMP28]], float [[X]])
+; CHECK-NEXT: [[TMP52:%.*]] = tail call float @llvm.pow.f32(float [[TMP30]], float [[X]])
+; CHECK-NEXT: [[TMP53:%.*]] = tail call float @llvm.pow.f32(float [[TMP32]], float [[X]])
+; CHECK-NEXT: [[TMP54:%.*]] = tail call float @llvm.pow.f32(float [[TMP34]], float [[X]])
+; CHECK-NEXT: [[TMP55:%.*]] = tail call float @llvm.pow.f32(float [[TMP36]], float [[X]])
+; CHECK-NEXT: [[TMP56:%.*]] = tail call float @llvm.pow.f32(float [[TMP38]], float [[X]])
+; CHECK-NEXT: [[TMP57:%.*]] = tail call float @llvm.pow.f32(float [[TMP40]], float [[X]])
+; CHECK-NEXT: [[TMP58:%.*]] = tail call float @llvm.pow.f32(float [[TMP42]], float [[X]])
+; CHECK-NEXT: [[TMP59:%.*]] = insertelement <16 x float> poison, float [[TMP43]], i32 0
+; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x float> [[TMP59]], float [[TMP44]], i32 1
+; CHECK-NEXT: [[TMP61:%.*]] = insertelement <16 x float> [[TMP60]], float [[TMP45]], i32 2
+; CHECK-NEXT: [[TMP62:%.*]] = insertelement <16 x float> [[TMP61]], float [[TMP46]], i32 3
+; CHECK-NEXT: [[TMP63:%.*]] = insertelement <16 x float> [[TMP62]], float [[TMP47]], i32 4
+; CHECK-NEXT: [[TMP64:%.*]] = insertelement <16 x float> [[TMP63]], float [[TMP48]], i32 5
+; CHECK-NEXT: [[TMP65:%.*]] = insertelement <16 x float> [[TMP64]], float [[TMP49]], i32 6
+; CHECK-NEXT: [[TMP66:%.*]] = insertelement <16 x float> [[TMP65]], float [[TMP50]], i32 7
+; CHECK-NEXT: [[TMP67:%.*]] = insertelement <16 x float> [[TMP66]], float [[TMP51]], i32 8
+; CHECK-NEXT: [[TMP68:%.*]] = insertelement <16 x float> [[TMP67]], float [[TMP52]], i32 9
+; CHECK-NEXT: [[TMP69:%.*]] = insertelement <16 x float> [[TMP68]], float [[TMP53]], i32 10
+; CHECK-NEXT: [[TMP70:%.*]] = insertelement <16 x float> [[TMP69]], float [[TMP54]], i32 11
+; CHECK-NEXT: [[TMP71:%.*]] = insertelement <16 x float> [[TMP70]], float [[TMP55]], i32 12
+; CHECK-NEXT: [[TMP72:%.*]] = insertelement <16 x float> [[TMP71]], float [[TMP56]], i32 13
+; CHECK-NEXT: [[TMP73:%.*]] = insertelement <16 x float> [[TMP72]], float [[TMP57]], i32 14
+; CHECK-NEXT: [[TMP74:%.*]] = insertelement <16 x float> [[TMP73]], float [[TMP58]], i32 15
+; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP74]], ptr [[TMP5]], i32 4, <16 x i1> [[TMP7]])
+; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> zeroinitializer, ptr [[TMP5]], i32 4, <16 x i1> [[TMP6]])
+; CHECK-NEXT: store float 0.000000e+00, ptr [[E:%.*]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP75:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; CHECK-NEXT: br i1 [[TMP75]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 100, [[ENTRY]] ]
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: loop.header:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[DEC_IV_NEXT:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT: [[IV_INC:%.*]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[L_A:%.*]] = load float, ptr [[GEP_A]], align 4
+; CHECK-NEXT: [[C_A:%.*]] = fcmp ogt float [[L_A]], 0.000000e+00
+; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT: [[L_B:%.*]] = load float, ptr [[GEP_B]], align 4
+; CHECK-NEXT: [[C_B:%.*]] = fcmp ogt float [[L_B]], 0.000000e+00
+; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr float, ptr [[C]], i64 [[IV_INC]]
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_A]], [[C_B]]
+; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK: then:
+; CHECK-NEXT: store float 0.000000e+00, ptr [[GEP_C]], align 4
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: else:
+; CHECK-NEXT: [[IV_MUL_2:%.*]] = shl i64 [[IV]], 2
+; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr i8, ptr [[D]], i64 [[IV_MUL_2]]
+; CHECK-NEXT: [[L_D:%.*]] = load float, ptr [[GEP_D]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = fmul float [[L_D]], 2.000000e+00
+; CHECK-NEXT: [[POW_1:%.*]] = tail call float @llvm.pow.f32(float [[MUL]], float [[X]])
+; CHECK-NEXT: [[POW_2:%.*]] = tail call float @llvm.pow.f32(float [[POW_1]], float [[X]])
+; CHECK-NEXT: store float [[POW_2]], ptr [[GEP_C]], align 4
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: loop.latch:
+; CHECK-NEXT: store float 0.000000e+00, ptr [[E]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[DEC_IV_NEXT]] = add i64 [[DEC_IV]], -1
+; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[DEC_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %dec.iv = phi i64 [ 100, %entry ], [ %dec.iv.next, %loop.latch ]
+ %iv.inc = add i64 %iv, 1
+ %gep.A = getelementptr inbounds float, ptr %A, i64 %iv
+ %l.A = load float, ptr %gep.A, align 4
+ %c.A = fcmp ogt float %l.A, 0.0
+ %gep.B = getelementptr inbounds float, ptr %B, i64 %iv
+ %l.B = load float, ptr %gep.B, align 4
+ %c.B = fcmp ogt float %l.B, 0.0
+ %gep.C = getelementptr float, ptr %C, i64 %iv.inc
+ %and = and i1 %c.A, %c.B
+ br i1 %and, label %then, label %else
+
+then:
+ store float 0.0, ptr %gep.C, align 4
+ br label %loop.latch
+
+else:
+ %iv.mul.2 = shl i64 %iv, 2
+ %gep.D = getelementptr i8, ptr %D, i64 %iv.mul.2
+ %l.D = load float, ptr %gep.D, align 4
+ %mul = fmul float %l.D, 2.0
+ %pow.1 = tail call float @llvm.pow.f32(float %mul, float %x)
+ %pow.2 = tail call float @llvm.pow.f32(float %pow.1, float %x)
+ store float %pow.2, ptr %gep.C, align 4
+ br label %loop.latch
+
+loop.latch:
+ store float 0.000000e+00, ptr %E, align 4
+ %iv.next = add i64 %iv, 1
+ %dec.iv.next = add i64 %dec.iv, -1
+ %ec = icmp ne i64 %dec.iv.next, 0
+ br i1 %ec, label %loop.header, label %exit
+
+exit:
+ ret void
+}
+
+define i64 @avx512_cond_load_cost(ptr %src, i32 %a, i64 %b, i32 %c, i32 %d) #1 {
+; CHECK-LABEL: @avx512_cond_load_cost(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: loop.header:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: [[C_1:%.*]] = icmp slt i32 [[IV]], 0
+; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; CHECK: if.then:
+; CHECK-NEXT: [[TMP0:%.*]] = urem i32 [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT: [[MUL:%.*]] = sub i32 0, [[TMP0]]
+; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[C]], [[D:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[DIV]], [[MUL]]
+; CHECK-NEXT: [[EXT:%.*]] = sext i32 [[OR]] to i64
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr { i64, i64, i64 }, ptr [[SRC:%.*]], i64 [[EXT]], i32 2
+; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT: [[OR_2:%.*]] = or i64 [[L]], [[B:%.*]]
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[OR_2]], [[IF_THEN]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp ult i32 [[IV]], [[C]]
+; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i64 [ [[RES]], [[LOOP_LATCH]] ]
+; CHECK-NEXT: ret i64 [[RES_LCSSA]]
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %c.1 = icmp slt i32 %iv, 0
+ br i1 %c.1, label %if.then, label %loop.latch
+
+if.then:
+ %1 = urem i32 %a, %c
+ %mul = sub i32 0, %1
+ %div = udiv i32 %c, %d
+ %or = or i32 %div, %mul
+ %ext = sext i32 %or to i64
+ %gep = getelementptr { i64, i64, i64 }, ptr %src, i64 %ext, i32 2
+ %l = load i64, ptr %gep, align 8
+ %or.2 = or i64 %l, %b
+ br label %loop.latch
+
+loop.latch:
+ %res = phi i64 [ 0, %loop.header ], [ %or.2, %if.then ]
+ %iv.next = add i32 %iv, 1
+ %ec = icmp ult i32 %iv, %c
+ br i1 %ec, label %loop.header, label %exit
+
+exit:
+ ret i64 %res
+}
+
+define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 {
+; CHECK-LABEL: @cost_duplicate_recipe_for_sinking(
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 16, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP4]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP5]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = shl nsw i64 [[TMP6]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr nusw double, ptr [[A:%.*]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x double> [[WIDE_VEC1]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP13]], align 8
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x double> [[WIDE_VEC2]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP14]], align 8
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x double> [[WIDE_VEC3]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[TMP19:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC4]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC5]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC6]], zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP19]], i32 0
+; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = shl nsw i64 [[TMP3]], 2
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP24]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP25]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP19]], i32 1
+; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; CHECK: pred.store.if8:
+; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP28:%.*]] = shl nsw i64 [[TMP27]], 2
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP28]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP29]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]]
+; CHECK: pred.store.continue9:
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP19]], i32 2
+; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; CHECK: pred.store.if10:
+; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP32:%.*]] = shl nsw i64 [[TMP31]], 2
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP32]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP33]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]]
+; CHECK: pred.store.continue11:
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP19]], i32 3
+; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
+; CHECK: pred.store.if12:
+; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP36:%.*]] = shl nsw i64 [[TMP35]], 2
+; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP36]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP37]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE13]]
+; CHECK: pred.store.continue13:
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
+; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
+; CHECK: pred.store.if14:
+; CHECK-NEXT: [[TMP88:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP39:%.*]] = shl nsw i64 [[TMP88]], 2
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP39]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP40]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]]
+; CHECK: pred.store.continue15:
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1
+; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
+; CHECK: pred.store.if16:
+; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP43:%.*]] = shl nsw i64 [[TMP42]], 2
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP43]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP44]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]]
+; CHECK: pred.store.continue17:
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2
+; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; CHECK: pred.store.if18:
+; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP47:%.*]] = shl nsw i64 [[TMP46]], 2
+; CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP47]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP48]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]]
+; CHECK: pred.store.continue19:
+; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3
+; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; CHECK: pred.store.if20:
+; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP51:%.*]] = shl nsw i64 [[TMP50]], 2
+; CHECK-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP51]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP52]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]]
+; CHECK: pred.store.continue21:
+; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i1> [[TMP21]], i32 0
+; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
+; CHECK: pred.store.if22:
+; CHECK-NEXT: [[TMP107:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP54:%.*]] = shl nsw i64 [[TMP107]], 2
+; CHECK-NEXT: [[TMP55:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP54]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP55]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE23]]
+; CHECK: pred.store.continue23:
+; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP21]], i32 1
+; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
+; CHECK: pred.store.if24:
+; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 9
+; CHECK-NEXT: [[TMP58:%.*]] = shl nsw i64 [[TMP57]], 2
+; CHECK-NEXT: [[TMP59:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP58]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP59]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE25]]
+; CHECK: pred.store.continue25:
+; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP21]], i32 2
+; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
+; CHECK: pred.store.if26:
+; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 10
+; CHECK-NEXT: [[TMP62:%.*]] = shl nsw i64 [[TMP61]], 2
+; CHECK-NEXT: [[TMP63:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP62]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP63]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE27]]
+; CHECK: pred.store.continue27:
+; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP21]], i32 3
+; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
+; CHECK: pred.store.if28:
+; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 11
+; CHECK-NEXT: [[TMP66:%.*]] = shl nsw i64 [[TMP65]], 2
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP66]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP67]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE29]]
+; CHECK: pred.store.continue29:
+; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP22]], i32 0
+; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]]
+; CHECK: pred.store.if30:
+; CHECK-NEXT: [[TMP108:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT: [[TMP69:%.*]] = shl nsw i64 [[TMP108]], 2
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP69]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP70]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE31]]
+; CHECK: pred.store.continue31:
+; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP22]], i32 1
+; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]]
+; CHECK: pred.store.if32:
+; CHECK-NEXT: [[TMP72:%.*]] = add i64 [[INDEX]], 13
+; CHECK-NEXT: [[TMP73:%.*]] = shl nsw i64 [[TMP72]], 2
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP73]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP74]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE33]]
+; CHECK: pred.store.continue33:
+; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP22]], i32 2
+; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]]
+; CHECK: pred.store.if34:
+; CHECK-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], 14
+; CHECK-NEXT: [[TMP77:%.*]] = shl nsw i64 [[TMP76]], 2
+; CHECK-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP77]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP78]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE35]]
+; CHECK: pred.store.continue35:
+; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP22]], i32 3
+; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]]
+; CHECK: pred.store.if36:
+; CHECK-NEXT: [[TMP80:%.*]] = add i64 [[INDEX]], 15
+; CHECK-NEXT: [[TMP81:%.*]] = shl nsw i64 [[TMP80]], 2
+; CHECK-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP81]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP82]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE37]]
+; CHECK: pred.store.continue37:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP83:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF38:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[N_MOD_VF38]], 0
+; CHECK-NEXT: [[TMP85:%.*]] = select i1 [[TMP84]], i64 4, i64 [[N_MOD_VF38]]
+; CHECK-NEXT: [[N_VEC39:%.*]] = sub i64 [[TMP0]], [[TMP85]]
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX40:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL1]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT51:%.*]], [[PRED_STORE_CONTINUE50:%.*]] ]
+; CHECK-NEXT: [[TMP87:%.*]] = shl nsw i64 [[INDEX40]], 2
+; CHECK-NEXT: [[TMP89:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP87]]
+; CHECK-NEXT: [[WIDE_VEC41:%.*]] = load <16 x double>, ptr [[TMP89]], align 8
+; CHECK-NEXT: [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC41]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[TMP90:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC42]], zeroinitializer
+; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i1> [[TMP90]], i32 0
+; CHECK-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]]
+; CHECK: pred.store.if43:
+; CHECK-NEXT: [[TMP86:%.*]] = add i64 [[INDEX40]], 0
+; CHECK-NEXT: [[TMP92:%.*]] = shl nsw i64 [[TMP86]], 2
+; CHECK-NEXT: [[TMP93:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP92]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP93]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE44]]
+; CHECK: pred.store.continue44:
+; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP90]], i32 1
+; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]]
+; CHECK: pred.store.if45:
+; CHECK-NEXT: [[TMP95:%.*]] = add i64 [[INDEX40]], 1
+; CHECK-NEXT: [[TMP96:%.*]] = shl nsw i64 [[TMP95]], 2
+; CHECK-NEXT: [[TMP97:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP96]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP97]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE46]]
+; CHECK: pred.store.continue46:
+; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i1> [[TMP90]], i32 2
+; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]]
+; CHECK: pred.store.if47:
+; CHECK-NEXT: [[TMP99:%.*]] = add i64 [[INDEX40]], 2
+; CHECK-NEXT: [[TMP100:%.*]] = shl nsw i64 [[TMP99]], 2
+; CHECK-NEXT: [[TMP101:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP100]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP101]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE48]]
+; CHECK: pred.store.continue48:
+; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i1> [[TMP90]], i32 3
+; CHECK-NEXT: br i1 [[TMP102]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50]]
+; CHECK: pred.store.if49:
+; CHECK-NEXT: [[TMP103:%.*]] = add i64 [[INDEX40]], 3
+; CHECK-NEXT: [[TMP104:%.*]] = shl nsw i64 [[TMP103]], 2
+; CHECK-NEXT: [[TMP105:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP104]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP105]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE50]]
+; CHECK: pred.store.continue50:
+; CHECK-NEXT: [[INDEX_NEXT51]] = add nuw i64 [[INDEX40]], 4
+; CHECK-NEXT: [[TMP106:%.*]] = icmp eq i64 [[INDEX_NEXT51]], [[N_VEC39]]
+; CHECK-NEXT: br i1 [[TMP106]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC39]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: loop.header:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: [[IV_SHL:%.*]] = shl nsw i64 [[IV]], 2
+; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[IV_SHL]]
+; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_0]], align 8
+; CHECK-NEXT: [[C:%.*]] = fcmp oeq double [[L]], 0.000000e+00
+; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; CHECK: if.then:
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr double, ptr [[A]], i64 [[IV_SHL]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_1]], align 8
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %iv.shl = shl nsw i64 %iv, 2
+ %gep.0 = getelementptr nusw double, ptr %A, i64 %iv.shl
+ %l = load double, ptr %gep.0, align 8
+ %c = fcmp oeq double %l, 0.000000e+00
+ br i1 %c, label %if.then, label %loop.latch
+
+if.then:
+ %gep.1 = getelementptr double, ptr %A, i64 %iv.shl
+ store double 0.000000e+00, ptr %gep.1, align 8
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv, %N
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+; Test for https://github.com/llvm/llvm-project/issues/129236.
+define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) {
+; CHECK-LABEL: @cost_ashr_with_op_known_invariant_via_scev(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i16 0, 0
+; CHECK-NEXT: [[CONV_I:%.*]] = sext i16 0 to i32
+; CHECK-NEXT: [[CONV5_I:%.*]] = sext i8 [[A:%.*]] to i32
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: loop.header:
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 100, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: br i1 [[CMP_I]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK: then:
+; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[REM_I:%.*]], [[ELSE]] ], [ 0, [[LOOP_HEADER]] ]
+; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[CONV5_I]], [[P_1]]
+; CHECK-NEXT: [[TOBOOL6_NOT_I:%.*]] = icmp eq i32 [[SHR_I]], 0
+; CHECK-NEXT: [[SEXT_I:%.*]] = shl i32 [[P_1]], 24
+; CHECK-NEXT: [[TMP0:%.*]] = ashr exact i32 [[SEXT_I]], 24
+; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TOBOOL6_NOT_I]], i32 [[TMP0]], i32 0
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: else:
+; CHECK-NEXT: [[REM_I]] = urem i32 -1, [[CONV_I]]
+; CHECK-NEXT: [[CMP3_I:%.*]] = icmp sgt i32 [[REM_I]], 1
+; CHECK-NEXT: br i1 [[CMP3_I]], label [[LOOP_LATCH]], label [[THEN]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ 0, [[ELSE]] ], [ [[TMP1]], [[THEN]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], -1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
+; CHECK: exit:
+; CHECK-NEXT: [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ]
+; CHECK-NEXT: ret i32 [[P_2_LCSSA]]
+;
+entry:
+ %cmp.i = icmp eq i16 0, 0
+ %conv.i = sext i16 0 to i32
+ %conv5.i = sext i8 %a to i32
+ br label %loop.header
+
+loop.header:
+ %iv = phi i8 [ 100, %entry ], [ %iv.next, %loop.latch ]
+ br i1 %cmp.i, label %then, label %else
+
+then:
+ %p.1 = phi i32 [ %rem.i, %else ], [ 0, %loop.header ]
+ %shr.i = ashr i32 %conv5.i, %p.1
+ %tobool6.not.i = icmp eq i32 %shr.i, 0
+ %sext.i = shl i32 %p.1, 24
+ %2 = ashr exact i32 %sext.i, 24
+ %3 = select i1 %tobool6.not.i, i32 %2, i32 0
+ br label %loop.latch
+
+else:
+ %rem.i = urem i32 -1, %conv.i
+ %cmp3.i = icmp sgt i32 %rem.i, 1
+ br i1 %cmp3.i, label %loop.latch, label %then
+
+loop.latch:
+ %p.2 = phi i32 [ 0, %else ], [ %3, %then ]
+ %iv.next = add i8 %iv, -1
+ %ec = icmp eq i8 %iv.next, 0
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret i32 %p.2
+}
+
+attributes #0 = { "target-cpu"="znver4" }
+attributes #1 = { "target-features"="+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl" }
+attributes #2 = { "target-cpu"="znver3" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 472aa0b5b716e..b4c33aa7a50bc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -528,356 +528,6 @@ exit:
ret i1 %any.of.next
}
-define i64 @avx512_cond_load_cost(ptr %src, i32 %a, i64 %b, i32 %c, i32 %d) #1 {
-; CHECK-LABEL: @avx512_cond_load_cost(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[C_1:%.*]] = icmp slt i32 [[IV]], 0
-; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
-; CHECK: if.then:
-; CHECK-NEXT: [[TMP0:%.*]] = urem i32 [[A:%.*]], [[C:%.*]]
-; CHECK-NEXT: [[MUL:%.*]] = sub i32 0, [[TMP0]]
-; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[C]], [[D:%.*]]
-; CHECK-NEXT: [[OR:%.*]] = or i32 [[DIV]], [[MUL]]
-; CHECK-NEXT: [[EXT:%.*]] = sext i32 [[OR]] to i64
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr { i64, i64, i64 }, ptr [[SRC:%.*]], i64 [[EXT]], i32 2
-; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 8
-; CHECK-NEXT: [[OR_2:%.*]] = or i64 [[L]], [[B:%.*]]
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[OR_2]], [[IF_THEN]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp ult i32 [[IV]], [[C]]
-; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]]
-; CHECK: exit:
-; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i64 [ [[RES]], [[LOOP_LATCH]] ]
-; CHECK-NEXT: ret i64 [[RES_LCSSA]]
-;
-entry:
- br label %loop.header
-
-loop.header:
- %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
- %c.1 = icmp slt i32 %iv, 0
- br i1 %c.1, label %if.then, label %loop.latch
-
-if.then:
- %1 = urem i32 %a, %c
- %mul = sub i32 0, %1
- %div = udiv i32 %c, %d
- %or = or i32 %div, %mul
- %ext = sext i32 %or to i64
- %gep = getelementptr { i64, i64, i64 }, ptr %src, i64 %ext, i32 2
- %l = load i64, ptr %gep, align 8
- %or.2 = or i64 %l, %b
- br label %loop.latch
-
-loop.latch:
- %res = phi i64 [ 0, %loop.header ], [ %or.2, %if.then ]
- %iv.next = add i32 %iv, 1
- %ec = icmp ult i32 %iv, %c
- br i1 %ec, label %loop.header, label %exit
-
-exit:
- ret i64 %res
-}
-
-define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 {
-; CHECK-LABEL: @cost_duplicate_recipe_for_sinking(
-; CHECK-NEXT: iter.check:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[TMP0]], 16
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 16, i64 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 12
-; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP4]], 2
-; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP5]], 2
-; CHECK-NEXT: [[TMP10:%.*]] = shl nsw i64 [[TMP6]], 2
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr nusw double, ptr [[A:%.*]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP10]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP11]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP12]], align 8
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x double> [[WIDE_VEC1]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP13]], align 8
-; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x double> [[WIDE_VEC2]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP14]], align 8
-; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x double> [[WIDE_VEC3]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[TMP19:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC]], zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC4]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC5]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC6]], zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP19]], i32 0
-; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK: pred.store.if:
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP24:%.*]] = shl nsw i64 [[TMP3]], 2
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP24]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP25]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
-; CHECK: pred.store.continue:
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP19]], i32 1
-; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
-; CHECK: pred.store.if8:
-; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP28:%.*]] = shl nsw i64 [[TMP27]], 2
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP28]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP29]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]]
-; CHECK: pred.store.continue9:
-; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP19]], i32 2
-; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
-; CHECK: pred.store.if10:
-; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP32:%.*]] = shl nsw i64 [[TMP31]], 2
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP32]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP33]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]]
-; CHECK: pred.store.continue11:
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP19]], i32 3
-; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
-; CHECK: pred.store.if12:
-; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP36:%.*]] = shl nsw i64 [[TMP35]], 2
-; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP36]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP37]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE13]]
-; CHECK: pred.store.continue13:
-; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
-; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
-; CHECK: pred.store.if14:
-; CHECK-NEXT: [[TMP88:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP39:%.*]] = shl nsw i64 [[TMP88]], 2
-; CHECK-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP39]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP40]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]]
-; CHECK: pred.store.continue15:
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1
-; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
-; CHECK: pred.store.if16:
-; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[INDEX]], 5
-; CHECK-NEXT: [[TMP43:%.*]] = shl nsw i64 [[TMP42]], 2
-; CHECK-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP43]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP44]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]]
-; CHECK: pred.store.continue17:
-; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2
-; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
-; CHECK: pred.store.if18:
-; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], 6
-; CHECK-NEXT: [[TMP47:%.*]] = shl nsw i64 [[TMP46]], 2
-; CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP47]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP48]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]]
-; CHECK: pred.store.continue19:
-; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3
-; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
-; CHECK: pred.store.if20:
-; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], 7
-; CHECK-NEXT: [[TMP51:%.*]] = shl nsw i64 [[TMP50]], 2
-; CHECK-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP51]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP52]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]]
-; CHECK: pred.store.continue21:
-; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i1> [[TMP21]], i32 0
-; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
-; CHECK: pred.store.if22:
-; CHECK-NEXT: [[TMP107:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP54:%.*]] = shl nsw i64 [[TMP107]], 2
-; CHECK-NEXT: [[TMP55:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP54]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP55]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE23]]
-; CHECK: pred.store.continue23:
-; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP21]], i32 1
-; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
-; CHECK: pred.store.if24:
-; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 9
-; CHECK-NEXT: [[TMP58:%.*]] = shl nsw i64 [[TMP57]], 2
-; CHECK-NEXT: [[TMP59:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP58]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP59]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE25]]
-; CHECK: pred.store.continue25:
-; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP21]], i32 2
-; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
-; CHECK: pred.store.if26:
-; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 10
-; CHECK-NEXT: [[TMP62:%.*]] = shl nsw i64 [[TMP61]], 2
-; CHECK-NEXT: [[TMP63:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP62]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP63]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE27]]
-; CHECK: pred.store.continue27:
-; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP21]], i32 3
-; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
-; CHECK: pred.store.if28:
-; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 11
-; CHECK-NEXT: [[TMP66:%.*]] = shl nsw i64 [[TMP65]], 2
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP66]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP67]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE29]]
-; CHECK: pred.store.continue29:
-; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP22]], i32 0
-; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]]
-; CHECK: pred.store.if30:
-; CHECK-NEXT: [[TMP108:%.*]] = add i64 [[INDEX]], 12
-; CHECK-NEXT: [[TMP69:%.*]] = shl nsw i64 [[TMP108]], 2
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP69]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP70]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE31]]
-; CHECK: pred.store.continue31:
-; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP22]], i32 1
-; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]]
-; CHECK: pred.store.if32:
-; CHECK-NEXT: [[TMP72:%.*]] = add i64 [[INDEX]], 13
-; CHECK-NEXT: [[TMP73:%.*]] = shl nsw i64 [[TMP72]], 2
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP73]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP74]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE33]]
-; CHECK: pred.store.continue33:
-; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP22]], i32 2
-; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]]
-; CHECK: pred.store.if34:
-; CHECK-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], 14
-; CHECK-NEXT: [[TMP77:%.*]] = shl nsw i64 [[TMP76]], 2
-; CHECK-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP77]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP78]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE35]]
-; CHECK: pred.store.continue35:
-; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP22]], i32 3
-; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]]
-; CHECK: pred.store.if36:
-; CHECK-NEXT: [[TMP80:%.*]] = add i64 [[INDEX]], 15
-; CHECK-NEXT: [[TMP81:%.*]] = shl nsw i64 [[TMP80]], 2
-; CHECK-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP81]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP82]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE37]]
-; CHECK: pred.store.continue37:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP83:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[N_MOD_VF38:%.*]] = urem i64 [[TMP0]], 4
-; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[N_MOD_VF38]], 0
-; CHECK-NEXT: [[TMP85:%.*]] = select i1 [[TMP84]], i64 4, i64 [[N_MOD_VF38]]
-; CHECK-NEXT: [[N_VEC39:%.*]] = sub i64 [[TMP0]], [[TMP85]]
-; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX40:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL1]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT51:%.*]], [[PRED_STORE_CONTINUE50:%.*]] ]
-; CHECK-NEXT: [[TMP87:%.*]] = shl nsw i64 [[INDEX40]], 2
-; CHECK-NEXT: [[TMP89:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP87]]
-; CHECK-NEXT: [[WIDE_VEC41:%.*]] = load <16 x double>, ptr [[TMP89]], align 8
-; CHECK-NEXT: [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC41]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[TMP90:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC42]], zeroinitializer
-; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i1> [[TMP90]], i32 0
-; CHECK-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]]
-; CHECK: pred.store.if43:
-; CHECK-NEXT: [[TMP86:%.*]] = add i64 [[INDEX40]], 0
-; CHECK-NEXT: [[TMP92:%.*]] = shl nsw i64 [[TMP86]], 2
-; CHECK-NEXT: [[TMP93:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP92]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP93]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE44]]
-; CHECK: pred.store.continue44:
-; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP90]], i32 1
-; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]]
-; CHECK: pred.store.if45:
-; CHECK-NEXT: [[TMP95:%.*]] = add i64 [[INDEX40]], 1
-; CHECK-NEXT: [[TMP96:%.*]] = shl nsw i64 [[TMP95]], 2
-; CHECK-NEXT: [[TMP97:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP96]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP97]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE46]]
-; CHECK: pred.store.continue46:
-; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i1> [[TMP90]], i32 2
-; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]]
-; CHECK: pred.store.if47:
-; CHECK-NEXT: [[TMP99:%.*]] = add i64 [[INDEX40]], 2
-; CHECK-NEXT: [[TMP100:%.*]] = shl nsw i64 [[TMP99]], 2
-; CHECK-NEXT: [[TMP101:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP100]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP101]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE48]]
-; CHECK: pred.store.continue48:
-; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i1> [[TMP90]], i32 3
-; CHECK-NEXT: br i1 [[TMP102]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50]]
-; CHECK: pred.store.if49:
-; CHECK-NEXT: [[TMP103:%.*]] = add i64 [[INDEX40]], 3
-; CHECK-NEXT: [[TMP104:%.*]] = shl nsw i64 [[TMP103]], 2
-; CHECK-NEXT: [[TMP105:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP104]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP105]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE50]]
-; CHECK: pred.store.continue50:
-; CHECK-NEXT: [[INDEX_NEXT51]] = add nuw i64 [[INDEX40]], 4
-; CHECK-NEXT: [[TMP106:%.*]] = icmp eq i64 [[INDEX_NEXT51]], [[N_VEC39]]
-; CHECK-NEXT: br i1 [[TMP106]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC39]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[IV_SHL:%.*]] = shl nsw i64 [[IV]], 2
-; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[IV_SHL]]
-; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_0]], align 8
-; CHECK-NEXT: [[C:%.*]] = fcmp oeq double [[L]], 0.000000e+00
-; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
-; CHECK: if.then:
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr double, ptr [[A]], i64 [[IV_SHL]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_1]], align 8
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %loop.header
-
-loop.header:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
- %iv.shl = shl nsw i64 %iv, 2
- %gep.0 = getelementptr nusw double, ptr %A, i64 %iv.shl
- %l = load double, ptr %gep.0, align 8
- %c = fcmp oeq double %l, 0.000000e+00
- br i1 %c, label %if.then, label %loop.latch
-
-if.then:
- %gep.1 = getelementptr double, ptr %A, i64 %iv.shl
- store double 0.000000e+00, ptr %gep.1, align 8
- br label %loop.latch
-
-loop.latch:
- %iv.next = add nsw i64 %iv, 1
- %ec = icmp eq i64 %iv, %N
- br i1 %ec, label %exit, label %loop.header
-
-exit:
- ret void
-}
-
define i64 @cost_assume(ptr %end, i64 %N) {
; CHECK-LABEL: @cost_assume(
; CHECK-NEXT: entry:
@@ -1317,72 +967,6 @@ exit:
ret i32 %select.next
}
-; Test for https://github.com/llvm/llvm-project/issues/129236.
-define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) {
-; CHECK-LABEL: @cost_ashr_with_op_known_invariant_via_scev(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i16 0, 0
-; CHECK-NEXT: [[CONV_I:%.*]] = sext i16 0 to i32
-; CHECK-NEXT: [[CONV5_I:%.*]] = sext i8 [[A:%.*]] to i32
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 100, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: br i1 [[CMP_I]], label [[THEN:%.*]], label [[ELSE:%.*]]
-; CHECK: then:
-; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[REM_I:%.*]], [[ELSE]] ], [ 0, [[LOOP_HEADER]] ]
-; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[CONV5_I]], [[P_1]]
-; CHECK-NEXT: [[TOBOOL6_NOT_I:%.*]] = icmp eq i32 [[SHR_I]], 0
-; CHECK-NEXT: [[SEXT_I:%.*]] = shl i32 [[P_1]], 24
-; CHECK-NEXT: [[TMP0:%.*]] = ashr exact i32 [[SEXT_I]], 24
-; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TOBOOL6_NOT_I]], i32 [[TMP0]], i32 0
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: else:
-; CHECK-NEXT: [[REM_I]] = urem i32 -1, [[CONV_I]]
-; CHECK-NEXT: [[CMP3_I:%.*]] = icmp sgt i32 [[REM_I]], 1
-; CHECK-NEXT: br i1 [[CMP3_I]], label [[LOOP_LATCH]], label [[THEN]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ 0, [[ELSE]] ], [ [[TMP1]], [[THEN]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], -1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 0
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
-; CHECK: exit:
-; CHECK-NEXT: [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ]
-; CHECK-NEXT: ret i32 [[P_2_LCSSA]]
-;
-entry:
- %cmp.i = icmp eq i16 0, 0
- %conv.i = sext i16 0 to i32
- %conv5.i = sext i8 %a to i32
- br label %loop.header
-
-loop.header:
- %iv = phi i8 [ 100, %entry ], [ %iv.next, %loop.latch ]
- br i1 %cmp.i, label %then, label %else
-
-then:
- %p.1 = phi i32 [ %rem.i, %else ], [ 0, %loop.header ]
- %shr.i = ashr i32 %conv5.i, %p.1
- %tobool6.not.i = icmp eq i32 %shr.i, 0
- %sext.i = shl i32 %p.1, 24
- %2 = ashr exact i32 %sext.i, 24
- %3 = select i1 %tobool6.not.i, i32 %2, i32 0
- br label %loop.latch
-
-else:
- %rem.i = urem i32 -1, %conv.i
- %cmp3.i = icmp sgt i32 %rem.i, 1
- br i1 %cmp3.i, label %loop.latch, label %then
-
-loop.latch:
- %p.2 = phi i32 [ 0, %else ], [ %3, %then ]
- %iv.next = add i8 %iv, -1
- %ec = icmp eq i8 %iv.next, 0
- br i1 %ec, label %exit, label %loop.header
-
-exit:
- ret i32 %p.2
-}
-
declare void @llvm.assume(i1 noundef) #0
attributes #0 = { "target-cpu"="penryn" }
More information about the llvm-commits
mailing list