[llvm] 0aac227 - [LV] Correctly cost chains of replicating calls in legacy CM.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 31 07:14:07 PDT 2025


Author: Florian Hahn
Date: 2025-08-31T15:13:47+01:00
New Revision: 0aac22758a81a98d9612ed1ad4853d9e434e8451

URL: https://github.com/llvm/llvm-project/commit/0aac22758a81a98d9612ed1ad4853d9e434e8451
DIFF: https://github.com/llvm/llvm-project/commit/0aac22758a81a98d9612ed1ad4853d9e434e8451.diff

LOG: [LV] Correctly cost chains of replicating calls in legacy CM.

Check for scalarized calls in needsExtract to fix a divergence between
the legacy and VPlan-based cost models.

The legacy cost model was missing a check for scalarized calls in
needsExtract, which meant it incorrectly assumed the result of a
scalarized call needs extracting.

Exposed by https://github.com/llvm/llvm-project/pull/154617.

Fixes https://github.com/llvm/llvm-project/issues/156091.

Added: 
    llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/X86/cost-model.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 56403dba92df2..1f4610ad191ca 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1134,7 +1134,10 @@ class LoopVectorizationCostModel {
   CallWideningDecision getCallWideningDecision(CallInst *CI,
                                                ElementCount VF) const {
     assert(!VF.isScalar() && "Expected vector VF");
-    return CallWideningDecisions.at({CI, VF});
+    auto I = CallWideningDecisions.find({CI, VF});
+    if (I == CallWideningDecisions.end())
+      return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
+    return I->second;
   }
 
   /// Return True if instruction \p I is an optimizable truncate whose operand
@@ -1657,7 +1660,9 @@ class LoopVectorizationCostModel {
     Instruction *I = dyn_cast<Instruction>(V);
     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
         TheLoop->isLoopInvariant(I) ||
-        getWideningDecision(I, VF) == CM_Scalarize)
+        getWideningDecision(I, VF) == CM_Scalarize ||
+        (isa<CallInst>(I) &&
+         getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
       return false;
 
     // Assume we can vectorize V (and hence we need extraction) if the

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
new file mode 100644
index 0000000000000..67f0201476602
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
@@ -0,0 +1,599 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Test case for https://github.com/llvm/llvm-project/issues/156091.
+define void @test_replicate_call_chain(float %x, ptr noalias %A, ptr noalias %B, ptr align 4 noalias %C, ptr align 4 noalias %D, ptr noalias %E) #0 {
+; CHECK-LABEL: @test_replicate_call_chain(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <16 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp ogt <16 x float> [[WIDE_LOAD1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr float, ptr [[C:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and <16 x i1> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <16 x i1> [[TMP6]], splat (i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP9]], i32 4, <16 x i1> [[TMP7]], <16 x float> poison)
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <16 x float> [[WIDE_MASKED_LOAD]], splat (float 2.000000e+00)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call float @llvm.pow.f32(float [[TMP11]], float [[X:%.*]])
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP10]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call float @llvm.pow.f32(float [[TMP13]], float [[X]])
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP10]], i32 2
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call float @llvm.pow.f32(float [[TMP15]], float [[X]])
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP10]], i32 3
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call float @llvm.pow.f32(float [[TMP17]], float [[X]])
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x float> [[TMP10]], i32 4
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call float @llvm.pow.f32(float [[TMP19]], float [[X]])
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <16 x float> [[TMP10]], i32 5
+; CHECK-NEXT:    [[TMP22:%.*]] = tail call float @llvm.pow.f32(float [[TMP21]], float [[X]])
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <16 x float> [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call float @llvm.pow.f32(float [[TMP23]], float [[X]])
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <16 x float> [[TMP10]], i32 7
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.pow.f32(float [[TMP25]], float [[X]])
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x float> [[TMP10]], i32 8
+; CHECK-NEXT:    [[TMP28:%.*]] = tail call float @llvm.pow.f32(float [[TMP27]], float [[X]])
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <16 x float> [[TMP10]], i32 9
+; CHECK-NEXT:    [[TMP30:%.*]] = tail call float @llvm.pow.f32(float [[TMP29]], float [[X]])
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x float> [[TMP10]], i32 10
+; CHECK-NEXT:    [[TMP32:%.*]] = tail call float @llvm.pow.f32(float [[TMP31]], float [[X]])
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <16 x float> [[TMP10]], i32 11
+; CHECK-NEXT:    [[TMP34:%.*]] = tail call float @llvm.pow.f32(float [[TMP33]], float [[X]])
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <16 x float> [[TMP10]], i32 12
+; CHECK-NEXT:    [[TMP36:%.*]] = tail call float @llvm.pow.f32(float [[TMP35]], float [[X]])
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x float> [[TMP10]], i32 13
+; CHECK-NEXT:    [[TMP38:%.*]] = tail call float @llvm.pow.f32(float [[TMP37]], float [[X]])
+; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <16 x float> [[TMP10]], i32 14
+; CHECK-NEXT:    [[TMP40:%.*]] = tail call float @llvm.pow.f32(float [[TMP39]], float [[X]])
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <16 x float> [[TMP10]], i32 15
+; CHECK-NEXT:    [[TMP42:%.*]] = tail call float @llvm.pow.f32(float [[TMP41]], float [[X]])
+; CHECK-NEXT:    [[TMP43:%.*]] = tail call float @llvm.pow.f32(float [[TMP12]], float [[X]])
+; CHECK-NEXT:    [[TMP44:%.*]] = tail call float @llvm.pow.f32(float [[TMP14]], float [[X]])
+; CHECK-NEXT:    [[TMP45:%.*]] = tail call float @llvm.pow.f32(float [[TMP16]], float [[X]])
+; CHECK-NEXT:    [[TMP46:%.*]] = tail call float @llvm.pow.f32(float [[TMP18]], float [[X]])
+; CHECK-NEXT:    [[TMP47:%.*]] = tail call float @llvm.pow.f32(float [[TMP20]], float [[X]])
+; CHECK-NEXT:    [[TMP48:%.*]] = tail call float @llvm.pow.f32(float [[TMP22]], float [[X]])
+; CHECK-NEXT:    [[TMP49:%.*]] = tail call float @llvm.pow.f32(float [[TMP24]], float [[X]])
+; CHECK-NEXT:    [[TMP50:%.*]] = tail call float @llvm.pow.f32(float [[TMP26]], float [[X]])
+; CHECK-NEXT:    [[TMP51:%.*]] = tail call float @llvm.pow.f32(float [[TMP28]], float [[X]])
+; CHECK-NEXT:    [[TMP52:%.*]] = tail call float @llvm.pow.f32(float [[TMP30]], float [[X]])
+; CHECK-NEXT:    [[TMP53:%.*]] = tail call float @llvm.pow.f32(float [[TMP32]], float [[X]])
+; CHECK-NEXT:    [[TMP54:%.*]] = tail call float @llvm.pow.f32(float [[TMP34]], float [[X]])
+; CHECK-NEXT:    [[TMP55:%.*]] = tail call float @llvm.pow.f32(float [[TMP36]], float [[X]])
+; CHECK-NEXT:    [[TMP56:%.*]] = tail call float @llvm.pow.f32(float [[TMP38]], float [[X]])
+; CHECK-NEXT:    [[TMP57:%.*]] = tail call float @llvm.pow.f32(float [[TMP40]], float [[X]])
+; CHECK-NEXT:    [[TMP58:%.*]] = tail call float @llvm.pow.f32(float [[TMP42]], float [[X]])
+; CHECK-NEXT:    [[TMP59:%.*]] = insertelement <16 x float> poison, float [[TMP43]], i32 0
+; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <16 x float> [[TMP59]], float [[TMP44]], i32 1
+; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <16 x float> [[TMP60]], float [[TMP45]], i32 2
+; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <16 x float> [[TMP61]], float [[TMP46]], i32 3
+; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <16 x float> [[TMP62]], float [[TMP47]], i32 4
+; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <16 x float> [[TMP63]], float [[TMP48]], i32 5
+; CHECK-NEXT:    [[TMP65:%.*]] = insertelement <16 x float> [[TMP64]], float [[TMP49]], i32 6
+; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <16 x float> [[TMP65]], float [[TMP50]], i32 7
+; CHECK-NEXT:    [[TMP67:%.*]] = insertelement <16 x float> [[TMP66]], float [[TMP51]], i32 8
+; CHECK-NEXT:    [[TMP68:%.*]] = insertelement <16 x float> [[TMP67]], float [[TMP52]], i32 9
+; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <16 x float> [[TMP68]], float [[TMP53]], i32 10
+; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <16 x float> [[TMP69]], float [[TMP54]], i32 11
+; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <16 x float> [[TMP70]], float [[TMP55]], i32 12
+; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <16 x float> [[TMP71]], float [[TMP56]], i32 13
+; CHECK-NEXT:    [[TMP73:%.*]] = insertelement <16 x float> [[TMP72]], float [[TMP57]], i32 14
+; CHECK-NEXT:    [[TMP74:%.*]] = insertelement <16 x float> [[TMP73]], float [[TMP58]], i32 15
+; CHECK-NEXT:    call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP74]], ptr [[TMP5]], i32 4, <16 x i1> [[TMP7]])
+; CHECK-NEXT:    call void @llvm.masked.store.v16f32.p0(<16 x float> zeroinitializer, ptr [[TMP5]], i32 4, <16 x i1> [[TMP6]])
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[E:%.*]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP75:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; CHECK-NEXT:    br i1 [[TMP75]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 100, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[DEC_IV_NEXT:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    [[IV_INC:%.*]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[L_A:%.*]] = load float, ptr [[GEP_A]], align 4
+; CHECK-NEXT:    [[C_A:%.*]] = fcmp ogt float [[L_A]], 0.000000e+00
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[L_B:%.*]] = load float, ptr [[GEP_B]], align 4
+; CHECK-NEXT:    [[C_B:%.*]] = fcmp ogt float [[L_B]], 0.000000e+00
+; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr float, ptr [[C]], i64 [[IV_INC]]
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[C_A]], [[C_B]]
+; CHECK-NEXT:    br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[GEP_C]], align 4
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       else:
+; CHECK-NEXT:    [[IV_MUL_2:%.*]] = shl i64 [[IV]], 2
+; CHECK-NEXT:    [[GEP_D:%.*]] = getelementptr i8, ptr [[D]], i64 [[IV_MUL_2]]
+; CHECK-NEXT:    [[L_D:%.*]] = load float, ptr [[GEP_D]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[L_D]], 2.000000e+00
+; CHECK-NEXT:    [[POW_1:%.*]] = tail call float @llvm.pow.f32(float [[MUL]], float [[X]])
+; CHECK-NEXT:    [[POW_2:%.*]] = tail call float @llvm.pow.f32(float [[POW_1]], float [[X]])
+; CHECK-NEXT:    store float [[POW_2]], ptr [[GEP_C]], align 4
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[E]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[DEC_IV_NEXT]] = add i64 [[DEC_IV]], -1
+; CHECK-NEXT:    [[EC:%.*]] = icmp ne i64 [[DEC_IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %dec.iv = phi i64 [ 100, %entry ], [ %dec.iv.next, %loop.latch ]
+  %iv.inc = add i64 %iv, 1
+  %gep.A = getelementptr inbounds float, ptr %A, i64 %iv
+  %l.A = load float, ptr %gep.A, align 4
+  %c.A = fcmp ogt float %l.A, 0.0
+  %gep.B = getelementptr inbounds float, ptr %B, i64 %iv
+  %l.B = load float, ptr %gep.B, align 4
+  %c.B = fcmp ogt float %l.B, 0.0
+  %gep.C = getelementptr float, ptr %C, i64 %iv.inc
+  %and = and i1 %c.A, %c.B
+  br i1 %and, label %then, label %else
+
+then:
+  store float 0.0, ptr %gep.C, align 4
+  br label %loop.latch
+
+else:
+  %iv.mul.2 = shl i64 %iv, 2
+  %gep.D = getelementptr i8, ptr %D, i64 %iv.mul.2
+  %l.D = load float, ptr %gep.D, align 4
+  %mul = fmul float %l.D, 2.0
+  %pow.1 = tail call float @llvm.pow.f32(float %mul, float %x)
+  %pow.2 = tail call float @llvm.pow.f32(float %pow.1, float %x)
+  store float %pow.2, ptr %gep.C, align 4
+  br label %loop.latch
+
+loop.latch:
+  store float 0.000000e+00, ptr %E, align 4
+  %iv.next = add i64 %iv, 1
+  %dec.iv.next = add i64 %dec.iv, -1
+  %ec = icmp ne i64 %dec.iv.next, 0
+  br i1 %ec, label %loop.header, label %exit
+
+exit:
+  ret void
+}
+
+define i64 @avx512_cond_load_cost(ptr %src, i32 %a, i64 %b, i32 %c, i32 %d) #1 {
+; CHECK-LABEL: @avx512_cond_load_cost(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[IV]], 0
+; CHECK-NEXT:    br i1 [[C_1]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP0:%.*]] = urem i32 [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = sub i32 0, [[TMP0]]
+; CHECK-NEXT:    [[DIV:%.*]] = udiv i32 [[C]], [[D:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[DIV]], [[MUL]]
+; CHECK-NEXT:    [[EXT:%.*]] = sext i32 [[OR]] to i64
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr { i64, i64, i64 }, ptr [[SRC:%.*]], i64 [[EXT]], i32 2
+; CHECK-NEXT:    [[L:%.*]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[OR_2:%.*]] = or i64 [[L]], [[B:%.*]]
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[OR_2]], [[IF_THEN]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp ult i32 [[IV]], [[C]]
+; CHECK-NEXT:    br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i64 [ [[RES]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i64 [[RES_LCSSA]]
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %c.1 = icmp slt i32 %iv, 0
+  br i1 %c.1, label %if.then, label %loop.latch
+
+if.then:
+  %1 = urem i32 %a, %c
+  %mul = sub i32 0, %1
+  %div = udiv i32 %c, %d
+  %or = or i32 %div, %mul
+  %ext = sext i32 %or to i64
+  %gep = getelementptr { i64, i64, i64 }, ptr %src, i64 %ext, i32 2
+  %l = load i64, ptr %gep, align 8
+  %or.2 = or i64 %l, %b
+  br label %loop.latch
+
+loop.latch:
+  %res = phi i64 [ 0, %loop.header ], [ %or.2, %if.then ]
+  %iv.next = add i32 %iv, 1
+  %ec = icmp ult i32 %iv, %c
+  br i1 %ec, label %loop.header, label %exit
+
+exit:
+  ret i64 %res
+}
+
+define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 {
+; CHECK-LABEL: @cost_duplicate_recipe_for_sinking(
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[TMP0]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 16, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT:    [[TMP7:%.*]] = shl nsw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = shl nsw i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = shl nsw i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = shl nsw i64 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr nusw double, ptr [[A:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP11]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP12]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x double> [[WIDE_VEC1]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x double> [[WIDE_VEC2]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x double> [[WIDE_VEC3]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP19:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC4]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC5]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC6]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[TMP19]], i32 0
+; CHECK-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = shl nsw i64 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP24]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP25]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[TMP19]], i32 1
+; CHECK-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; CHECK:       pred.store.if8:
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP28:%.*]] = shl nsw i64 [[TMP27]], 2
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP28]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP29]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; CHECK:       pred.store.continue9:
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP19]], i32 2
+; CHECK-NEXT:    br i1 [[TMP30]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; CHECK:       pred.store.if10:
+; CHECK-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP32:%.*]] = shl nsw i64 [[TMP31]], 2
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP32]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP33]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; CHECK:       pred.store.continue11:
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP19]], i32 3
+; CHECK-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
+; CHECK:       pred.store.if12:
+; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP36:%.*]] = shl nsw i64 [[TMP35]], 2
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP36]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP37]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE13]]
+; CHECK:       pred.store.continue13:
+; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
+; CHECK-NEXT:    br i1 [[TMP38]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
+; CHECK:       pred.store.if14:
+; CHECK-NEXT:    [[TMP88:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP39:%.*]] = shl nsw i64 [[TMP88]], 2
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP39]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP40]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE15]]
+; CHECK:       pred.store.continue15:
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1
+; CHECK-NEXT:    br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
+; CHECK:       pred.store.if16:
+; CHECK-NEXT:    [[TMP42:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT:    [[TMP43:%.*]] = shl nsw i64 [[TMP42]], 2
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP43]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP44]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE17]]
+; CHECK:       pred.store.continue17:
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2
+; CHECK-NEXT:    br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; CHECK:       pred.store.if18:
+; CHECK-NEXT:    [[TMP46:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT:    [[TMP47:%.*]] = shl nsw i64 [[TMP46]], 2
+; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP47]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP48]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE19]]
+; CHECK:       pred.store.continue19:
+; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3
+; CHECK-NEXT:    br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; CHECK:       pred.store.if20:
+; CHECK-NEXT:    [[TMP50:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT:    [[TMP51:%.*]] = shl nsw i64 [[TMP50]], 2
+; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP51]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP52]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE21]]
+; CHECK:       pred.store.continue21:
+; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i1> [[TMP21]], i32 0
+; CHECK-NEXT:    br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
+; CHECK:       pred.store.if22:
+; CHECK-NEXT:    [[TMP107:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP54:%.*]] = shl nsw i64 [[TMP107]], 2
+; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP54]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP55]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE23]]
+; CHECK:       pred.store.continue23:
+; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <4 x i1> [[TMP21]], i32 1
+; CHECK-NEXT:    br i1 [[TMP56]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
+; CHECK:       pred.store.if24:
+; CHECK-NEXT:    [[TMP57:%.*]] = add i64 [[INDEX]], 9
+; CHECK-NEXT:    [[TMP58:%.*]] = shl nsw i64 [[TMP57]], 2
+; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP58]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP59]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE25]]
+; CHECK:       pred.store.continue25:
+; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <4 x i1> [[TMP21]], i32 2
+; CHECK-NEXT:    br i1 [[TMP60]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
+; CHECK:       pred.store.if26:
+; CHECK-NEXT:    [[TMP61:%.*]] = add i64 [[INDEX]], 10
+; CHECK-NEXT:    [[TMP62:%.*]] = shl nsw i64 [[TMP61]], 2
+; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP62]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP63]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE27]]
+; CHECK:       pred.store.continue27:
+; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <4 x i1> [[TMP21]], i32 3
+; CHECK-NEXT:    br i1 [[TMP64]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
+; CHECK:       pred.store.if28:
+; CHECK-NEXT:    [[TMP65:%.*]] = add i64 [[INDEX]], 11
+; CHECK-NEXT:    [[TMP66:%.*]] = shl nsw i64 [[TMP65]], 2
+; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP66]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP67]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE29]]
+; CHECK:       pred.store.continue29:
+; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i1> [[TMP22]], i32 0
+; CHECK-NEXT:    br i1 [[TMP68]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]]
+; CHECK:       pred.store.if30:
+; CHECK-NEXT:    [[TMP108:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT:    [[TMP69:%.*]] = shl nsw i64 [[TMP108]], 2
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP69]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP70]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE31]]
+; CHECK:       pred.store.continue31:
+; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <4 x i1> [[TMP22]], i32 1
+; CHECK-NEXT:    br i1 [[TMP71]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]]
+; CHECK:       pred.store.if32:
+; CHECK-NEXT:    [[TMP72:%.*]] = add i64 [[INDEX]], 13
+; CHECK-NEXT:    [[TMP73:%.*]] = shl nsw i64 [[TMP72]], 2
+; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP73]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP74]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE33]]
+; CHECK:       pred.store.continue33:
+; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <4 x i1> [[TMP22]], i32 2
+; CHECK-NEXT:    br i1 [[TMP75]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]]
+; CHECK:       pred.store.if34:
+; CHECK-NEXT:    [[TMP76:%.*]] = add i64 [[INDEX]], 14
+; CHECK-NEXT:    [[TMP77:%.*]] = shl nsw i64 [[TMP76]], 2
+; CHECK-NEXT:    [[TMP78:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP77]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP78]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE35]]
+; CHECK:       pred.store.continue35:
+; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i1> [[TMP22]], i32 3
+; CHECK-NEXT:    br i1 [[TMP79]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]]
+; CHECK:       pred.store.if36:
+; CHECK-NEXT:    [[TMP80:%.*]] = add i64 [[INDEX]], 15
+; CHECK-NEXT:    [[TMP81:%.*]] = shl nsw i64 [[TMP80]], 2
+; CHECK-NEXT:    [[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP81]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP82]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE37]]
+; CHECK:       pred.store.continue37:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP83:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[N_MOD_VF38:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[N_MOD_VF38]], 0
+; CHECK-NEXT:    [[TMP85:%.*]] = select i1 [[TMP84]], i64 4, i64 [[N_MOD_VF38]]
+; CHECK-NEXT:    [[N_VEC39:%.*]] = sub i64 [[TMP0]], [[TMP85]]
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK-NEXT:    [[INDEX40:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL1]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT51:%.*]], [[PRED_STORE_CONTINUE50:%.*]] ]
+; CHECK-NEXT:    [[TMP87:%.*]] = shl nsw i64 [[INDEX40]], 2
+; CHECK-NEXT:    [[TMP89:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP87]]
+; CHECK-NEXT:    [[WIDE_VEC41:%.*]] = load <16 x double>, ptr [[TMP89]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC41]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP90:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC42]], zeroinitializer
+; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <4 x i1> [[TMP90]], i32 0
+; CHECK-NEXT:    br i1 [[TMP91]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]]
+; CHECK:       pred.store.if43:
+; CHECK-NEXT:    [[TMP86:%.*]] = add i64 [[INDEX40]], 0
+; CHECK-NEXT:    [[TMP92:%.*]] = shl nsw i64 [[TMP86]], 2
+; CHECK-NEXT:    [[TMP93:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP92]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP93]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE44]]
+; CHECK:       pred.store.continue44:
+; CHECK-NEXT:    [[TMP94:%.*]] = extractelement <4 x i1> [[TMP90]], i32 1
+; CHECK-NEXT:    br i1 [[TMP94]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]]
+; CHECK:       pred.store.if45:
+; CHECK-NEXT:    [[TMP95:%.*]] = add i64 [[INDEX40]], 1
+; CHECK-NEXT:    [[TMP96:%.*]] = shl nsw i64 [[TMP95]], 2
+; CHECK-NEXT:    [[TMP97:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP96]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP97]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE46]]
+; CHECK:       pred.store.continue46:
+; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <4 x i1> [[TMP90]], i32 2
+; CHECK-NEXT:    br i1 [[TMP98]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]]
+; CHECK:       pred.store.if47:
+; CHECK-NEXT:    [[TMP99:%.*]] = add i64 [[INDEX40]], 2
+; CHECK-NEXT:    [[TMP100:%.*]] = shl nsw i64 [[TMP99]], 2
+; CHECK-NEXT:    [[TMP101:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP100]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP101]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE48]]
+; CHECK:       pred.store.continue48:
+; CHECK-NEXT:    [[TMP102:%.*]] = extractelement <4 x i1> [[TMP90]], i32 3
+; CHECK-NEXT:    br i1 [[TMP102]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50]]
+; CHECK:       pred.store.if49:
+; CHECK-NEXT:    [[TMP103:%.*]] = add i64 [[INDEX40]], 3
+; CHECK-NEXT:    [[TMP104:%.*]] = shl nsw i64 [[TMP103]], 2
+; CHECK-NEXT:    [[TMP105:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP104]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP105]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE50]]
+; CHECK:       pred.store.continue50:
+; CHECK-NEXT:    [[INDEX_NEXT51]] = add nuw i64 [[INDEX40]], 4
+; CHECK-NEXT:    [[TMP106:%.*]] = icmp eq i64 [[INDEX_NEXT51]], [[N_VEC39]]
+; CHECK-NEXT:    br i1 [[TMP106]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       vec.epilog.middle.block:
+; CHECK-NEXT:    br label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       vec.epilog.scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC39]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[IV_SHL:%.*]] = shl nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[GEP_0:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[IV_SHL]]
+; CHECK-NEXT:    [[L:%.*]] = load double, ptr [[GEP_0]], align 8
+; CHECK-NEXT:    [[C:%.*]] = fcmp oeq double [[L]], 0.000000e+00
+; CHECK-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr double, ptr [[A]], i64 [[IV_SHL]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[GEP_1]], align 8
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %iv.shl = shl nsw i64 %iv, 2
+  %gep.0 = getelementptr nusw double, ptr %A, i64 %iv.shl
+  %l = load double, ptr %gep.0, align 8
+  %c = fcmp oeq double %l, 0.000000e+00
+  br i1 %c, label %if.then, label %loop.latch
+
+if.then:
+  %gep.1 = getelementptr double, ptr %A, i64 %iv.shl
+  store double 0.000000e+00, ptr %gep.1, align 8
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv, %N
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+; Test for https://github.com/llvm/llvm-project/issues/129236.
+define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) {
+; CHECK-LABEL: @cost_ashr_with_op_known_invariant_via_scev(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i16 0, 0
+; CHECK-NEXT:    [[CONV_I:%.*]] = sext i16 0 to i32
+; CHECK-NEXT:    [[CONV5_I:%.*]] = sext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i8 [ 100, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    br i1 [[CMP_I]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[P_1:%.*]] = phi i32 [ [[REM_I:%.*]], [[ELSE]] ], [ 0, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[SHR_I:%.*]] = ashr i32 [[CONV5_I]], [[P_1]]
+; CHECK-NEXT:    [[TOBOOL6_NOT_I:%.*]] = icmp eq i32 [[SHR_I]], 0
+; CHECK-NEXT:    [[SEXT_I:%.*]] = shl i32 [[P_1]], 24
+; CHECK-NEXT:    [[TMP0:%.*]] = ashr exact i32 [[SEXT_I]], 24
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TOBOOL6_NOT_I]], i32 [[TMP0]], i32 0
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       else:
+; CHECK-NEXT:    [[REM_I]] = urem i32 -1, [[CONV_I]]
+; CHECK-NEXT:    [[CMP3_I:%.*]] = icmp sgt i32 [[REM_I]], 1
+; CHECK-NEXT:    br i1 [[CMP3_I]], label [[LOOP_LATCH]], label [[THEN]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[P_2:%.*]] = phi i32 [ 0, [[ELSE]] ], [ [[TMP1]], [[THEN]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i8 [[IV]], -1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i32 [[P_2_LCSSA]]
+;
+entry:
+  %cmp.i = icmp eq i16 0, 0
+  %conv.i = sext i16 0 to i32
+  %conv5.i = sext i8 %a to i32
+  br label %loop.header
+
+loop.header:
+  %iv = phi i8 [ 100, %entry ], [ %iv.next, %loop.latch ]
+  br i1 %cmp.i, label %then, label %else
+
+then:
+  %p.1 = phi i32 [ %rem.i, %else ], [ 0, %loop.header ]
+  %shr.i = ashr i32 %conv5.i, %p.1
+  %tobool6.not.i = icmp eq i32 %shr.i, 0
+  %sext.i = shl i32 %p.1, 24
+  %2 = ashr exact i32 %sext.i, 24
+  %3 = select i1 %tobool6.not.i, i32 %2, i32 0
+  br label %loop.latch
+
+else:
+  %rem.i = urem i32 -1, %conv.i
+  %cmp3.i = icmp sgt i32 %rem.i, 1
+  br i1 %cmp3.i, label %loop.latch, label %then
+
+loop.latch:
+  %p.2 = phi i32 [ 0, %else ], [ %3, %then ]
+  %iv.next = add i8 %iv, -1
+  %ec = icmp eq i8 %iv.next, 0
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret i32 %p.2
+}
+
+attributes #0 = { "target-cpu"="znver4" }
+attributes #1 = { "target-features"="+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl" }
+attributes #2 = { "target-cpu"="znver3" }

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 472aa0b5b716e..b4c33aa7a50bc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -528,356 +528,6 @@ exit:
   ret i1 %any.of.next
 }
 
-define i64 @avx512_cond_load_cost(ptr %src, i32 %a, i64 %b, i32 %c, i32 %d) #1 {
-; CHECK-LABEL: @avx512_cond_load_cost(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[IV]], 0
-; CHECK-NEXT:    br i1 [[C_1]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[TMP0:%.*]] = urem i32 [[A:%.*]], [[C:%.*]]
-; CHECK-NEXT:    [[MUL:%.*]] = sub i32 0, [[TMP0]]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i32 [[C]], [[D:%.*]]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[DIV]], [[MUL]]
-; CHECK-NEXT:    [[EXT:%.*]] = sext i32 [[OR]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr { i64, i64, i64 }, ptr [[SRC:%.*]], i64 [[EXT]], i32 2
-; CHECK-NEXT:    [[L:%.*]] = load i64, ptr [[GEP]], align 8
-; CHECK-NEXT:    [[OR_2:%.*]] = or i64 [[L]], [[B:%.*]]
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[OR_2]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp ult i32 [[IV]], [[C]]
-; CHECK-NEXT:    br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i64 [ [[RES]], [[LOOP_LATCH]] ]
-; CHECK-NEXT:    ret i64 [[RES_LCSSA]]
-;
-entry:
-  br label %loop.header
-
-loop.header:
-  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
-  %c.1 = icmp slt i32 %iv, 0
-  br i1 %c.1, label %if.then, label %loop.latch
-
-if.then:
-  %1 = urem i32 %a, %c
-  %mul = sub i32 0, %1
-  %div = udiv i32 %c, %d
-  %or = or i32 %div, %mul
-  %ext = sext i32 %or to i64
-  %gep = getelementptr { i64, i64, i64 }, ptr %src, i64 %ext, i32 2
-  %l = load i64, ptr %gep, align 8
-  %or.2 = or i64 %l, %b
-  br label %loop.latch
-
-loop.latch:
-  %res = phi i64 [ 0, %loop.header ], [ %or.2, %if.then ]
-  %iv.next = add i32 %iv, 1
-  %ec = icmp ult i32 %iv, %c
-  br i1 %ec, label %loop.header, label %exit
-
-exit:
-  ret i64 %res
-}
-
-define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 {
-; CHECK-LABEL: @cost_duplicate_recipe_for_sinking(
-; CHECK-NEXT:  iter.check:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[TMP0]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 16, i64 [[N_MOD_VF]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 12
-; CHECK-NEXT:    [[TMP7:%.*]] = shl nsw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = shl nsw i64 [[TMP4]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = shl nsw i64 [[TMP5]], 2
-; CHECK-NEXT:    [[TMP10:%.*]] = shl nsw i64 [[TMP6]], 2
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr nusw double, ptr [[A:%.*]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP10]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP11]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP12]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x double> [[WIDE_VEC1]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x double> [[WIDE_VEC2]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:    [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP14]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x double> [[WIDE_VEC3]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:    [[TMP19:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC]], zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC4]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC5]], zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC6]], zeroinitializer
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[TMP19]], i32 0
-; CHECK-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP24:%.*]] = shl nsw i64 [[TMP3]], 2
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP24]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP25]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[TMP19]], i32 1
-; CHECK-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
-; CHECK:       pred.store.if8:
-; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP28:%.*]] = shl nsw i64 [[TMP27]], 2
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP28]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP29]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE9]]
-; CHECK:       pred.store.continue9:
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP19]], i32 2
-; CHECK-NEXT:    br i1 [[TMP30]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
-; CHECK:       pred.store.if10:
-; CHECK-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP32:%.*]] = shl nsw i64 [[TMP31]], 2
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP32]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP33]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE11]]
-; CHECK:       pred.store.continue11:
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP19]], i32 3
-; CHECK-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
-; CHECK:       pred.store.if12:
-; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP36:%.*]] = shl nsw i64 [[TMP35]], 2
-; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP36]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP37]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE13]]
-; CHECK:       pred.store.continue13:
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
-; CHECK-NEXT:    br i1 [[TMP38]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
-; CHECK:       pred.store.if14:
-; CHECK-NEXT:    [[TMP88:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP39:%.*]] = shl nsw i64 [[TMP88]], 2
-; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP39]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP40]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE15]]
-; CHECK:       pred.store.continue15:
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1
-; CHECK-NEXT:    br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
-; CHECK:       pred.store.if16:
-; CHECK-NEXT:    [[TMP42:%.*]] = add i64 [[INDEX]], 5
-; CHECK-NEXT:    [[TMP43:%.*]] = shl nsw i64 [[TMP42]], 2
-; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP43]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP44]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE17]]
-; CHECK:       pred.store.continue17:
-; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2
-; CHECK-NEXT:    br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
-; CHECK:       pred.store.if18:
-; CHECK-NEXT:    [[TMP46:%.*]] = add i64 [[INDEX]], 6
-; CHECK-NEXT:    [[TMP47:%.*]] = shl nsw i64 [[TMP46]], 2
-; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP47]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP48]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE19]]
-; CHECK:       pred.store.continue19:
-; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3
-; CHECK-NEXT:    br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
-; CHECK:       pred.store.if20:
-; CHECK-NEXT:    [[TMP50:%.*]] = add i64 [[INDEX]], 7
-; CHECK-NEXT:    [[TMP51:%.*]] = shl nsw i64 [[TMP50]], 2
-; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP51]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP52]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE21]]
-; CHECK:       pred.store.continue21:
-; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i1> [[TMP21]], i32 0
-; CHECK-NEXT:    br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
-; CHECK:       pred.store.if22:
-; CHECK-NEXT:    [[TMP107:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP54:%.*]] = shl nsw i64 [[TMP107]], 2
-; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP54]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP55]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE23]]
-; CHECK:       pred.store.continue23:
-; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <4 x i1> [[TMP21]], i32 1
-; CHECK-NEXT:    br i1 [[TMP56]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
-; CHECK:       pred.store.if24:
-; CHECK-NEXT:    [[TMP57:%.*]] = add i64 [[INDEX]], 9
-; CHECK-NEXT:    [[TMP58:%.*]] = shl nsw i64 [[TMP57]], 2
-; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP58]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP59]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE25]]
-; CHECK:       pred.store.continue25:
-; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <4 x i1> [[TMP21]], i32 2
-; CHECK-NEXT:    br i1 [[TMP60]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
-; CHECK:       pred.store.if26:
-; CHECK-NEXT:    [[TMP61:%.*]] = add i64 [[INDEX]], 10
-; CHECK-NEXT:    [[TMP62:%.*]] = shl nsw i64 [[TMP61]], 2
-; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP62]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP63]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE27]]
-; CHECK:       pred.store.continue27:
-; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <4 x i1> [[TMP21]], i32 3
-; CHECK-NEXT:    br i1 [[TMP64]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
-; CHECK:       pred.store.if28:
-; CHECK-NEXT:    [[TMP65:%.*]] = add i64 [[INDEX]], 11
-; CHECK-NEXT:    [[TMP66:%.*]] = shl nsw i64 [[TMP65]], 2
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP66]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP67]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE29]]
-; CHECK:       pred.store.continue29:
-; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i1> [[TMP22]], i32 0
-; CHECK-NEXT:    br i1 [[TMP68]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]]
-; CHECK:       pred.store.if30:
-; CHECK-NEXT:    [[TMP108:%.*]] = add i64 [[INDEX]], 12
-; CHECK-NEXT:    [[TMP69:%.*]] = shl nsw i64 [[TMP108]], 2
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP69]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP70]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE31]]
-; CHECK:       pred.store.continue31:
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <4 x i1> [[TMP22]], i32 1
-; CHECK-NEXT:    br i1 [[TMP71]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]]
-; CHECK:       pred.store.if32:
-; CHECK-NEXT:    [[TMP72:%.*]] = add i64 [[INDEX]], 13
-; CHECK-NEXT:    [[TMP73:%.*]] = shl nsw i64 [[TMP72]], 2
-; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP73]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP74]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE33]]
-; CHECK:       pred.store.continue33:
-; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <4 x i1> [[TMP22]], i32 2
-; CHECK-NEXT:    br i1 [[TMP75]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]]
-; CHECK:       pred.store.if34:
-; CHECK-NEXT:    [[TMP76:%.*]] = add i64 [[INDEX]], 14
-; CHECK-NEXT:    [[TMP77:%.*]] = shl nsw i64 [[TMP76]], 2
-; CHECK-NEXT:    [[TMP78:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP77]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP78]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE35]]
-; CHECK:       pred.store.continue35:
-; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i1> [[TMP22]], i32 3
-; CHECK-NEXT:    br i1 [[TMP79]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]]
-; CHECK:       pred.store.if36:
-; CHECK-NEXT:    [[TMP80:%.*]] = add i64 [[INDEX]], 15
-; CHECK-NEXT:    [[TMP81:%.*]] = shl nsw i64 [[TMP80]], 2
-; CHECK-NEXT:    [[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP81]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP82]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE37]]
-; CHECK:       pred.store.continue37:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP83:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 4
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[N_MOD_VF38:%.*]] = urem i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[N_MOD_VF38]], 0
-; CHECK-NEXT:    [[TMP85:%.*]] = select i1 [[TMP84]], i64 4, i64 [[N_MOD_VF38]]
-; CHECK-NEXT:    [[N_VEC39:%.*]] = sub i64 [[TMP0]], [[TMP85]]
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX40:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL1]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT51:%.*]], [[PRED_STORE_CONTINUE50:%.*]] ]
-; CHECK-NEXT:    [[TMP87:%.*]] = shl nsw i64 [[INDEX40]], 2
-; CHECK-NEXT:    [[TMP89:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP87]]
-; CHECK-NEXT:    [[WIDE_VEC41:%.*]] = load <16 x double>, ptr [[TMP89]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC41]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:    [[TMP90:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC42]], zeroinitializer
-; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <4 x i1> [[TMP90]], i32 0
-; CHECK-NEXT:    br i1 [[TMP91]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]]
-; CHECK:       pred.store.if43:
-; CHECK-NEXT:    [[TMP86:%.*]] = add i64 [[INDEX40]], 0
-; CHECK-NEXT:    [[TMP92:%.*]] = shl nsw i64 [[TMP86]], 2
-; CHECK-NEXT:    [[TMP93:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP92]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP93]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE44]]
-; CHECK:       pred.store.continue44:
-; CHECK-NEXT:    [[TMP94:%.*]] = extractelement <4 x i1> [[TMP90]], i32 1
-; CHECK-NEXT:    br i1 [[TMP94]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]]
-; CHECK:       pred.store.if45:
-; CHECK-NEXT:    [[TMP95:%.*]] = add i64 [[INDEX40]], 1
-; CHECK-NEXT:    [[TMP96:%.*]] = shl nsw i64 [[TMP95]], 2
-; CHECK-NEXT:    [[TMP97:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP96]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP97]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE46]]
-; CHECK:       pred.store.continue46:
-; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <4 x i1> [[TMP90]], i32 2
-; CHECK-NEXT:    br i1 [[TMP98]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]]
-; CHECK:       pred.store.if47:
-; CHECK-NEXT:    [[TMP99:%.*]] = add i64 [[INDEX40]], 2
-; CHECK-NEXT:    [[TMP100:%.*]] = shl nsw i64 [[TMP99]], 2
-; CHECK-NEXT:    [[TMP101:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP100]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP101]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE48]]
-; CHECK:       pred.store.continue48:
-; CHECK-NEXT:    [[TMP102:%.*]] = extractelement <4 x i1> [[TMP90]], i32 3
-; CHECK-NEXT:    br i1 [[TMP102]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50]]
-; CHECK:       pred.store.if49:
-; CHECK-NEXT:    [[TMP103:%.*]] = add i64 [[INDEX40]], 3
-; CHECK-NEXT:    [[TMP104:%.*]] = shl nsw i64 [[TMP103]], 2
-; CHECK-NEXT:    [[TMP105:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP104]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP105]], align 8
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE50]]
-; CHECK:       pred.store.continue50:
-; CHECK-NEXT:    [[INDEX_NEXT51]] = add nuw i64 [[INDEX40]], 4
-; CHECK-NEXT:    [[TMP106:%.*]] = icmp eq i64 [[INDEX_NEXT51]], [[N_VEC39]]
-; CHECK-NEXT:    br i1 [[TMP106]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    br label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC39]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[IV_SHL:%.*]] = shl nsw i64 [[IV]], 2
-; CHECK-NEXT:    [[GEP_0:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[IV_SHL]]
-; CHECK-NEXT:    [[L:%.*]] = load double, ptr [[GEP_0]], align 8
-; CHECK-NEXT:    [[C:%.*]] = fcmp oeq double [[L]], 0.000000e+00
-; CHECK-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr double, ptr [[A]], i64 [[IV_SHL]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[GEP_1]], align 8
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
-entry:
-  br label %loop.header
-
-loop.header:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
-  %iv.shl = shl nsw i64 %iv, 2
-  %gep.0 = getelementptr nusw double, ptr %A, i64 %iv.shl
-  %l = load double, ptr %gep.0, align 8
-  %c = fcmp oeq double %l, 0.000000e+00
-  br i1 %c, label %if.then, label %loop.latch
-
-if.then:
-  %gep.1 = getelementptr double, ptr %A, i64 %iv.shl
-  store double 0.000000e+00, ptr %gep.1, align 8
-  br label %loop.latch
-
-loop.latch:
-  %iv.next = add nsw i64 %iv, 1
-  %ec = icmp eq i64 %iv, %N
-  br i1 %ec, label %exit, label %loop.header
-
-exit:
-  ret void
-}
-
 define i64 @cost_assume(ptr %end, i64 %N) {
 ; CHECK-LABEL: @cost_assume(
 ; CHECK-NEXT:  entry:
@@ -1317,72 +967,6 @@ exit:
   ret i32 %select.next
 }
 
-; Test for https://github.com/llvm/llvm-project/issues/129236.
-define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) {
-; CHECK-LABEL: @cost_ashr_with_op_known_invariant_via_scev(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i16 0, 0
-; CHECK-NEXT:    [[CONV_I:%.*]] = sext i16 0 to i32
-; CHECK-NEXT:    [[CONV5_I:%.*]] = sext i8 [[A:%.*]] to i32
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i8 [ 100, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    br i1 [[CMP_I]], label [[THEN:%.*]], label [[ELSE:%.*]]
-; CHECK:       then:
-; CHECK-NEXT:    [[P_1:%.*]] = phi i32 [ [[REM_I:%.*]], [[ELSE]] ], [ 0, [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[SHR_I:%.*]] = ashr i32 [[CONV5_I]], [[P_1]]
-; CHECK-NEXT:    [[TOBOOL6_NOT_I:%.*]] = icmp eq i32 [[SHR_I]], 0
-; CHECK-NEXT:    [[SEXT_I:%.*]] = shl i32 [[P_1]], 24
-; CHECK-NEXT:    [[TMP0:%.*]] = ashr exact i32 [[SEXT_I]], 24
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TOBOOL6_NOT_I]], i32 [[TMP0]], i32 0
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       else:
-; CHECK-NEXT:    [[REM_I]] = urem i32 -1, [[CONV_I]]
-; CHECK-NEXT:    [[CMP3_I:%.*]] = icmp sgt i32 [[REM_I]], 1
-; CHECK-NEXT:    br i1 [[CMP3_I]], label [[LOOP_LATCH]], label [[THEN]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[P_2:%.*]] = phi i32 [ 0, [[ELSE]] ], [ [[TMP1]], [[THEN]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add i8 [[IV]], -1
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 0
-; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ]
-; CHECK-NEXT:    ret i32 [[P_2_LCSSA]]
-;
-entry:
-  %cmp.i = icmp eq i16 0, 0
-  %conv.i = sext i16 0 to i32
-  %conv5.i = sext i8 %a to i32
-  br label %loop.header
-
-loop.header:
-  %iv = phi i8 [ 100, %entry ], [ %iv.next, %loop.latch ]
-  br i1 %cmp.i, label %then, label %else
-
-then:
-  %p.1 = phi i32 [ %rem.i, %else ], [ 0, %loop.header ]
-  %shr.i = ashr i32 %conv5.i, %p.1
-  %tobool6.not.i = icmp eq i32 %shr.i, 0
-  %sext.i = shl i32 %p.1, 24
-  %2 = ashr exact i32 %sext.i, 24
-  %3 = select i1 %tobool6.not.i, i32 %2, i32 0
-  br label %loop.latch
-
-else:
-  %rem.i = urem i32 -1, %conv.i
-  %cmp3.i = icmp sgt i32 %rem.i, 1
-  br i1 %cmp3.i, label %loop.latch, label %then
-
-loop.latch:
-  %p.2 = phi i32 [ 0, %else ], [ %3, %then ]
-  %iv.next = add i8 %iv, -1
-  %ec = icmp eq i8 %iv.next, 0
-  br i1 %ec, label %exit, label %loop.header
-
-exit:
-  ret i32 %p.2
-}
-
 declare void @llvm.assume(i1 noundef) #0
 
 attributes #0 = { "target-cpu"="penryn" }


        


More information about the llvm-commits mailing list