[llvm] 8681bb8 - [LV] Add additional test coverage for cost modeling.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 26 02:18:25 PDT 2024
Author: Florian Hahn
Date: 2024-06-26T10:18:01+01:00
New Revision: 8681bb8bedf065abe34a4523fb58287fc05f7907
URL: https://github.com/llvm/llvm-project/commit/8681bb8bedf065abe34a4523fb58287fc05f7907
DIFF: https://github.com/llvm/llvm-project/commit/8681bb8bedf065abe34a4523fb58287fc05f7907.diff
LOG: [LV] Add additional test coverage for cost modeling.
Add tests for coverage gaps that were exposed by
https://github.com/llvm/llvm-project/pull/92555.
Includes tests for https://github.com/llvm/llvm-project/issues/96294 and
https://github.com/llvm/llvm-project/issues/96328.
Added:
llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
llvm/test/Transforms/LoopVectorize/WebAssembly/induction-branch-cost.ll
llvm/test/Transforms/LoopVectorize/WebAssembly/lit.local.cfg
Modified:
llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
llvm/test/Transforms/LoopVectorize/X86/optsize.ll
Removed:
################################################################################
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index f5ad8bbb3e7e0..78452a9c884ee 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -1224,7 +1224,484 @@ exit:
ret void
}
+define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias %src.2, ptr noalias %src.3, ptr noalias %src.4, ptr noalias %dst, i64 %N) #2 {
+; DEFAULT-LABEL: define void @test_conditional_interleave_group(
+; DEFAULT-SAME: ptr noalias [[SRC_1:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[SRC_3:%.*]], ptr noalias [[SRC_4:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; DEFAULT: vector.scevcheck:
+; DEFAULT-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
+; DEFAULT-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
+; DEFAULT-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
+; DEFAULT-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
+; DEFAULT-NEXT: [[TMP1:%.*]] = sub i64 0, [[MUL_RESULT]]
+; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = icmp ult ptr [[TMP2]], [[SCEVGEP]]
+; DEFAULT-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
+; DEFAULT-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 8
+; DEFAULT-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
+; DEFAULT-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
+; DEFAULT-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
+; DEFAULT-NEXT: [[TMP5:%.*]] = sub i64 0, [[MUL_RESULT3]]
+; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]]
+; DEFAULT-NEXT: [[TMP7:%.*]] = icmp ult ptr [[TMP6]], [[SCEVGEP1]]
+; DEFAULT-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW4]]
+; DEFAULT-NEXT: [[MUL5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
+; DEFAULT-NEXT: [[MUL_RESULT6:%.*]] = extractvalue { i64, i1 } [[MUL5]], 0
+; DEFAULT-NEXT: [[MUL_OVERFLOW7:%.*]] = extractvalue { i64, i1 } [[MUL5]], 1
+; DEFAULT-NEXT: [[TMP9:%.*]] = sub i64 0, [[MUL_RESULT6]]
+; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[MUL_RESULT6]]
+; DEFAULT-NEXT: [[TMP11:%.*]] = icmp ult ptr [[TMP10]], [[DST]]
+; DEFAULT-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW7]]
+; DEFAULT-NEXT: [[TMP13:%.*]] = or i1 [[TMP4]], [[TMP8]]
+; DEFAULT-NEXT: [[TMP14:%.*]] = or i1 [[TMP13]], [[TMP12]]
+; DEFAULT-NEXT: br i1 [[TMP14]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE27:%.*]] ]
+; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE27]] ]
+; DEFAULT-NEXT: [[TMP15:%.*]] = load float, ptr [[SRC_1]], align 4
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP15]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT8]], <8 x float> poison, <8 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP16:%.*]] = load float, ptr [[SRC_2]], align 4
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP16]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP17:%.*]] = fmul <8 x float> [[BROADCAST_SPLAT]], zeroinitializer
+; DEFAULT-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[TMP17]])
+; DEFAULT-NEXT: [[TMP19:%.*]] = load float, ptr [[SRC_3]], align 4
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT10]], <8 x float> poison, <8 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP20:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT11]], <8 x float> zeroinitializer, <8 x float> [[TMP18]])
+; DEFAULT-NEXT: [[TMP21:%.*]] = load float, ptr [[SRC_3]], align 4
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <8 x float> poison, float [[TMP21]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT12]], <8 x float> poison, <8 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP22:%.*]] = fcmp ogt <8 x float> [[TMP20]], [[BROADCAST_SPLAT13]]
+; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr { [4 x float] }, ptr [[DST]], <8 x i64> [[VEC_IND]]
+; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement <8 x i1> [[TMP22]], i32 0
+; DEFAULT-NEXT: br i1 [[TMP24]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; DEFAULT: pred.store.if:
+; DEFAULT-NEXT: [[TMP25:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 0
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP25]], align 4
+; DEFAULT-NEXT: [[TMP26:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 0
+; DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i64 4
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP27]], align 4
+; DEFAULT-NEXT: [[TMP28:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 0
+; DEFAULT-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[TMP28]], i64 8
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP29]], align 4
+; DEFAULT-NEXT: [[TMP30:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 0
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP30]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE]]
+; DEFAULT: pred.store.continue:
+; DEFAULT-NEXT: [[TMP31:%.*]] = extractelement <8 x i1> [[TMP22]], i32 1
+; DEFAULT-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
+; DEFAULT: pred.store.if14:
+; DEFAULT-NEXT: [[TMP32:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 1
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP32]], align 4
+; DEFAULT-NEXT: [[TMP33:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 1
+; DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP33]], i64 4
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP34]], align 4
+; DEFAULT-NEXT: [[TMP35:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 1
+; DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP35]], i64 8
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP36]], align 4
+; DEFAULT-NEXT: [[TMP37:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 1
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP37]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE15]]
+; DEFAULT: pred.store.continue15:
+; DEFAULT-NEXT: [[TMP38:%.*]] = extractelement <8 x i1> [[TMP22]], i32 2
+; DEFAULT-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
+; DEFAULT: pred.store.if16:
+; DEFAULT-NEXT: [[TMP39:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 2
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP39]], align 4
+; DEFAULT-NEXT: [[TMP40:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 2
+; DEFAULT-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[TMP40]], i64 4
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP41]], align 4
+; DEFAULT-NEXT: [[TMP42:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 2
+; DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP42]], i64 8
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP43]], align 4
+; DEFAULT-NEXT: [[TMP44:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 2
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP44]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE17]]
+; DEFAULT: pred.store.continue17:
+; DEFAULT-NEXT: [[TMP45:%.*]] = extractelement <8 x i1> [[TMP22]], i32 3
+; DEFAULT-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; DEFAULT: pred.store.if18:
+; DEFAULT-NEXT: [[TMP46:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 3
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP46]], align 4
+; DEFAULT-NEXT: [[TMP47:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 3
+; DEFAULT-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP47]], i64 4
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP48]], align 4
+; DEFAULT-NEXT: [[TMP49:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 3
+; DEFAULT-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[TMP49]], i64 8
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP50]], align 4
+; DEFAULT-NEXT: [[TMP51:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 3
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP51]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE19]]
+; DEFAULT: pred.store.continue19:
+; DEFAULT-NEXT: [[TMP52:%.*]] = extractelement <8 x i1> [[TMP22]], i32 4
+; DEFAULT-NEXT: br i1 [[TMP52]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; DEFAULT: pred.store.if20:
+; DEFAULT-NEXT: [[TMP53:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 4
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP53]], align 4
+; DEFAULT-NEXT: [[TMP54:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 4
+; DEFAULT-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[TMP54]], i64 4
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP55]], align 4
+; DEFAULT-NEXT: [[TMP56:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 4
+; DEFAULT-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[TMP56]], i64 8
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP57]], align 4
+; DEFAULT-NEXT: [[TMP58:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 4
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP58]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE21]]
+; DEFAULT: pred.store.continue21:
+; DEFAULT-NEXT: [[TMP59:%.*]] = extractelement <8 x i1> [[TMP22]], i32 5
+; DEFAULT-NEXT: br i1 [[TMP59]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
+; DEFAULT: pred.store.if22:
+; DEFAULT-NEXT: [[TMP60:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 5
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP60]], align 4
+; DEFAULT-NEXT: [[TMP61:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 5
+; DEFAULT-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[TMP61]], i64 4
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP62]], align 4
+; DEFAULT-NEXT: [[TMP63:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 5
+; DEFAULT-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[TMP63]], i64 8
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP64]], align 4
+; DEFAULT-NEXT: [[TMP65:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 5
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP65]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE23]]
+; DEFAULT: pred.store.continue23:
+; DEFAULT-NEXT: [[TMP66:%.*]] = extractelement <8 x i1> [[TMP22]], i32 6
+; DEFAULT-NEXT: br i1 [[TMP66]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
+; DEFAULT: pred.store.if24:
+; DEFAULT-NEXT: [[TMP67:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 6
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP67]], align 4
+; DEFAULT-NEXT: [[TMP68:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 6
+; DEFAULT-NEXT: [[TMP69:%.*]] = getelementptr i8, ptr [[TMP68]], i64 4
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP69]], align 4
+; DEFAULT-NEXT: [[TMP70:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 6
+; DEFAULT-NEXT: [[TMP71:%.*]] = getelementptr i8, ptr [[TMP70]], i64 8
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP71]], align 4
+; DEFAULT-NEXT: [[TMP72:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 6
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP72]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE25]]
+; DEFAULT: pred.store.continue25:
+; DEFAULT-NEXT: [[TMP73:%.*]] = extractelement <8 x i1> [[TMP22]], i32 7
+; DEFAULT-NEXT: br i1 [[TMP73]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27]]
+; DEFAULT: pred.store.if26:
+; DEFAULT-NEXT: [[TMP74:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 7
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP74]], align 4
+; DEFAULT-NEXT: [[TMP75:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 7
+; DEFAULT-NEXT: [[TMP76:%.*]] = getelementptr i8, ptr [[TMP75]], i64 4
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP76]], align 4
+; DEFAULT-NEXT: [[TMP77:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 7
+; DEFAULT-NEXT: [[TMP78:%.*]] = getelementptr i8, ptr [[TMP77]], i64 8
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP78]], align 4
+; DEFAULT-NEXT: [[TMP79:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 7
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP79]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE27]]
+; DEFAULT: pred.store.continue27:
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; DEFAULT-NEXT: [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT: br label [[LOOP_HEADER:%.*]]
+; DEFAULT: loop.header:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; DEFAULT-NEXT: [[TMP81:%.*]] = load float, ptr [[SRC_1]], align 4
+; DEFAULT-NEXT: [[TMP82:%.*]] = load float, ptr [[SRC_2]], align 4
+; DEFAULT-NEXT: [[MUL8_I_US:%.*]] = fmul float [[TMP82]], 0.000000e+00
+; DEFAULT-NEXT: [[TMP83:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP81]], float 0.000000e+00, float [[MUL8_I_US]])
+; DEFAULT-NEXT: [[TMP84:%.*]] = load float, ptr [[SRC_3]], align 4
+; DEFAULT-NEXT: [[TMP85:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP84]], float 0.000000e+00, float [[TMP83]])
+; DEFAULT-NEXT: [[TMP86:%.*]] = load float, ptr [[SRC_3]], align 4
+; DEFAULT-NEXT: [[C:%.*]] = fcmp ogt float [[TMP85]], [[TMP86]]
+; DEFAULT-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; DEFAULT: if.then:
+; DEFAULT-NEXT: [[DST_0:%.*]] = getelementptr { [4 x float] }, ptr [[DST]], i64 [[IV]]
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[DST_0]], align 4
+; DEFAULT-NEXT: [[DST_1:%.*]] = getelementptr i8, ptr [[DST_0]], i64 4
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[DST_1]], align 4
+; DEFAULT-NEXT: [[DST_2:%.*]] = getelementptr i8, ptr [[DST_0]], i64 8
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[DST_2]], align 4
+; DEFAULT-NEXT: [[DST_3:%.*]] = getelementptr i8, ptr [[DST_0]], i64 16
+; DEFAULT-NEXT: store float 0.000000e+00, ptr [[DST_0]], align 4
+; DEFAULT-NEXT: br label [[LOOP_LATCH]]
+; DEFAULT: loop.latch:
+; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @test_conditional_interleave_group(
+; PRED-SAME: ptr noalias [[SRC_1:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[SRC_3:%.*]], ptr noalias [[SRC_4:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
+; PRED-NEXT: entry:
+; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; PRED: vector.scevcheck:
+; PRED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
+; PRED-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
+; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
+; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
+; PRED-NEXT: [[TMP1:%.*]] = sub i64 0, [[MUL_RESULT]]
+; PRED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]]
+; PRED-NEXT: [[TMP3:%.*]] = icmp ult ptr [[TMP2]], [[SCEVGEP]]
+; PRED-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
+; PRED-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 8
+; PRED-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
+; PRED-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
+; PRED-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
+; PRED-NEXT: [[TMP5:%.*]] = sub i64 0, [[MUL_RESULT3]]
+; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]]
+; PRED-NEXT: [[TMP7:%.*]] = icmp ult ptr [[TMP6]], [[SCEVGEP1]]
+; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW4]]
+; PRED-NEXT: [[MUL5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
+; PRED-NEXT: [[MUL_RESULT6:%.*]] = extractvalue { i64, i1 } [[MUL5]], 0
+; PRED-NEXT: [[MUL_OVERFLOW7:%.*]] = extractvalue { i64, i1 } [[MUL5]], 1
+; PRED-NEXT: [[TMP9:%.*]] = sub i64 0, [[MUL_RESULT6]]
+; PRED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[MUL_RESULT6]]
+; PRED-NEXT: [[TMP11:%.*]] = icmp ult ptr [[TMP10]], [[DST]]
+; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW7]]
+; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP4]], [[TMP8]]
+; PRED-NEXT: [[TMP14:%.*]] = or i1 [[TMP13]], [[TMP12]]
+; PRED-NEXT: br i1 [[TMP14]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 7
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[TMP15:%.*]] = sub i64 [[TMP0]], 8
+; PRED-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[TMP0]], 8
+; PRED-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE27:%.*]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE27]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE27]] ]
+; PRED-NEXT: [[TMP18:%.*]] = load float, ptr [[SRC_1]], align 4
+; PRED-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT8]], <8 x float> poison, <8 x i32> zeroinitializer
+; PRED-NEXT: [[TMP19:%.*]] = load float, ptr [[SRC_2]], align 4
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; PRED-NEXT: [[TMP20:%.*]] = fmul <8 x float> [[BROADCAST_SPLAT]], zeroinitializer
+; PRED-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[TMP20]])
+; PRED-NEXT: [[TMP22:%.*]] = load float, ptr [[SRC_3]], align 4
+; PRED-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <8 x float> poison, float [[TMP22]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT10]], <8 x float> poison, <8 x i32> zeroinitializer
+; PRED-NEXT: [[TMP23:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT11]], <8 x float> zeroinitializer, <8 x float> [[TMP21]])
+; PRED-NEXT: [[TMP24:%.*]] = load float, ptr [[SRC_3]], align 4
+; PRED-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <8 x float> poison, float [[TMP24]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT12]], <8 x float> poison, <8 x i32> zeroinitializer
+; PRED-NEXT: [[TMP25:%.*]] = fcmp ogt <8 x float> [[TMP23]], [[BROADCAST_SPLAT13]]
+; PRED-NEXT: [[TMP26:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i1> [[TMP25]], <8 x i1> zeroinitializer
+; PRED-NEXT: [[TMP27:%.*]] = getelementptr { [4 x float] }, ptr [[DST]], <8 x i64> [[VEC_IND]]
+; PRED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP26]], i32 0
+; PRED-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED: pred.store.if:
+; PRED-NEXT: [[TMP29:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 0
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP29]], align 4
+; PRED-NEXT: [[TMP30:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 0
+; PRED-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[TMP30]], i64 4
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP31]], align 4
+; PRED-NEXT: [[TMP32:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 0
+; PRED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i64 8
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP33]], align 4
+; PRED-NEXT: [[TMP34:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 0
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP34]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
+; PRED: pred.store.continue:
+; PRED-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP26]], i32 1
+; PRED-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
+; PRED: pred.store.if14:
+; PRED-NEXT: [[TMP36:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 1
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP36]], align 4
+; PRED-NEXT: [[TMP37:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 1
+; PRED-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[TMP37]], i64 4
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP38]], align 4
+; PRED-NEXT: [[TMP39:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 1
+; PRED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[TMP39]], i64 8
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP40]], align 4
+; PRED-NEXT: [[TMP41:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 1
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP41]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE15]]
+; PRED: pred.store.continue15:
+; PRED-NEXT: [[TMP42:%.*]] = extractelement <8 x i1> [[TMP26]], i32 2
+; PRED-NEXT: br i1 [[TMP42]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
+; PRED: pred.store.if16:
+; PRED-NEXT: [[TMP43:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 2
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP43]], align 4
+; PRED-NEXT: [[TMP44:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 2
+; PRED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[TMP44]], i64 4
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP45]], align 4
+; PRED-NEXT: [[TMP46:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 2
+; PRED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[TMP46]], i64 8
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP47]], align 4
+; PRED-NEXT: [[TMP48:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 2
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP48]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE17]]
+; PRED: pred.store.continue17:
+; PRED-NEXT: [[TMP49:%.*]] = extractelement <8 x i1> [[TMP26]], i32 3
+; PRED-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; PRED: pred.store.if18:
+; PRED-NEXT: [[TMP50:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 3
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP50]], align 4
+; PRED-NEXT: [[TMP51:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 3
+; PRED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[TMP51]], i64 4
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP52]], align 4
+; PRED-NEXT: [[TMP53:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 3
+; PRED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[TMP53]], i64 8
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP54]], align 4
+; PRED-NEXT: [[TMP55:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 3
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP55]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE19]]
+; PRED: pred.store.continue19:
+; PRED-NEXT: [[TMP56:%.*]] = extractelement <8 x i1> [[TMP26]], i32 4
+; PRED-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; PRED: pred.store.if20:
+; PRED-NEXT: [[TMP57:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 4
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP57]], align 4
+; PRED-NEXT: [[TMP58:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 4
+; PRED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[TMP58]], i64 4
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP59]], align 4
+; PRED-NEXT: [[TMP60:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 4
+; PRED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[TMP60]], i64 8
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP61]], align 4
+; PRED-NEXT: [[TMP62:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 4
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP62]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE21]]
+; PRED: pred.store.continue21:
+; PRED-NEXT: [[TMP63:%.*]] = extractelement <8 x i1> [[TMP26]], i32 5
+; PRED-NEXT: br i1 [[TMP63]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
+; PRED: pred.store.if22:
+; PRED-NEXT: [[TMP64:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 5
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP64]], align 4
+; PRED-NEXT: [[TMP65:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 5
+; PRED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[TMP65]], i64 4
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP66]], align 4
+; PRED-NEXT: [[TMP67:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 5
+; PRED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[TMP67]], i64 8
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP68]], align 4
+; PRED-NEXT: [[TMP69:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 5
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP69]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE23]]
+; PRED: pred.store.continue23:
+; PRED-NEXT: [[TMP70:%.*]] = extractelement <8 x i1> [[TMP26]], i32 6
+; PRED-NEXT: br i1 [[TMP70]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
+; PRED: pred.store.if24:
+; PRED-NEXT: [[TMP71:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 6
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP71]], align 4
+; PRED-NEXT: [[TMP72:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 6
+; PRED-NEXT: [[TMP73:%.*]] = getelementptr i8, ptr [[TMP72]], i64 4
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP73]], align 4
+; PRED-NEXT: [[TMP74:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 6
+; PRED-NEXT: [[TMP75:%.*]] = getelementptr i8, ptr [[TMP74]], i64 8
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP75]], align 4
+; PRED-NEXT: [[TMP76:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 6
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP76]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE25]]
+; PRED: pred.store.continue25:
+; PRED-NEXT: [[TMP77:%.*]] = extractelement <8 x i1> [[TMP26]], i32 7
+; PRED-NEXT: br i1 [[TMP77]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27]]
+; PRED: pred.store.if26:
+; PRED-NEXT: [[TMP78:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 7
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP78]], align 4
+; PRED-NEXT: [[TMP79:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 7
+; PRED-NEXT: [[TMP80:%.*]] = getelementptr i8, ptr [[TMP79]], i64 4
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP80]], align 4
+; PRED-NEXT: [[TMP81:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 7
+; PRED-NEXT: [[TMP82:%.*]] = getelementptr i8, ptr [[TMP81]], i64 8
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP82]], align 4
+; PRED-NEXT: [[TMP83:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 7
+; PRED-NEXT: store float 0.000000e+00, ptr [[TMP83]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE27]]
+; PRED: pred.store.continue27:
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX]], i64 [[TMP17]])
+; PRED-NEXT: [[TMP84:%.*]] = xor <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; PRED-NEXT: [[TMP85:%.*]] = extractelement <8 x i1> [[TMP84]], i32 0
+; PRED-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT: br label [[LOOP_HEADER:%.*]]
+; PRED: loop.header:
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; PRED-NEXT: [[TMP86:%.*]] = load float, ptr [[SRC_1]], align 4
+; PRED-NEXT: [[TMP87:%.*]] = load float, ptr [[SRC_2]], align 4
+; PRED-NEXT: [[MUL8_I_US:%.*]] = fmul float [[TMP87]], 0.000000e+00
+; PRED-NEXT: [[TMP88:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP86]], float 0.000000e+00, float [[MUL8_I_US]])
+; PRED-NEXT: [[TMP89:%.*]] = load float, ptr [[SRC_3]], align 4
+; PRED-NEXT: [[TMP90:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP89]], float 0.000000e+00, float [[TMP88]])
+; PRED-NEXT: [[TMP91:%.*]] = load float, ptr [[SRC_3]], align 4
+; PRED-NEXT: [[C:%.*]] = fcmp ogt float [[TMP90]], [[TMP91]]
+; PRED-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; PRED: if.then:
+; PRED-NEXT: [[DST_0:%.*]] = getelementptr { [4 x float] }, ptr [[DST]], i64 [[IV]]
+; PRED-NEXT: store float 0.000000e+00, ptr [[DST_0]], align 4
+; PRED-NEXT: [[DST_1:%.*]] = getelementptr i8, ptr [[DST_0]], i64 4
+; PRED-NEXT: store float 0.000000e+00, ptr [[DST_1]], align 4
+; PRED-NEXT: [[DST_2:%.*]] = getelementptr i8, ptr [[DST_0]], i64 8
+; PRED-NEXT: store float 0.000000e+00, ptr [[DST_2]], align 4
+; PRED-NEXT: [[DST_3:%.*]] = getelementptr i8, ptr [[DST_0]], i64 16
+; PRED-NEXT: store float 0.000000e+00, ptr [[DST_0]], align 4
+; PRED-NEXT: br label [[LOOP_LATCH]]
+; PRED: loop.latch:
+; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP26:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %0 = load float, ptr %src.1, align 4
+ %1 = load float, ptr %src.2, align 4
+ %mul8.i.us = fmul float %1, 0.000000e+00
+ %2 = tail call float @llvm.fmuladd.f32(float %0, float 0.000000e+00, float %mul8.i.us)
+ %3 = load float, ptr %src.3, align 4
+ %4 = tail call float @llvm.fmuladd.f32(float %3, float 0.000000e+00, float %2)
+ %5 = load float, ptr %src.3, align 4
+ %c = fcmp ogt float %4, %5
+ br i1 %c, label %if.then, label %loop.latch
+
+if.then:
+ %dst.0 = getelementptr { [4 x float] }, ptr %dst, i64 %iv
+ store float 0.000000e+00, ptr %dst.0, align 4
+ %dst.1 = getelementptr i8, ptr %dst.0, i64 4
+ store float 0.000000e+00, ptr %dst.1, align 4
+ %dst.2 = getelementptr i8, ptr %dst.0, i64 8
+ store float 0.000000e+00, ptr %dst.2, align 4
+ %dst.3 = getelementptr i8, ptr %dst.0, i64 16
+ store float 0.000000e+00, ptr %dst.0, align 4
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv, %N
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.fmuladd.f32(float, float, float) #1
+
attributes #1 = { "target-cpu"="neoverse-512tvb" }
+attributes #2 = { vscale_range(2,2) "target-cpu"="neoverse-512tvb" }
;.
; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
@@ -1255,6 +1732,8 @@ attributes #1 = { "target-cpu"="neoverse-512tvb" }
; DEFAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
; DEFAULT: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
; DEFAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP29]] = distinct !{[[LOOP29]], [[META1]]}
;.
; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -1281,4 +1760,6 @@ attributes #1 = { "target-cpu"="neoverse-512tvb" }
; PRED: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]}
; PRED: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
; PRED: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
+; PRED: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]], [[META2]]}
+; PRED: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
new file mode 100644
index 0000000000000..99a7d1d34f26d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mattr=+v -passes=loop-vectorize -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-unknown-elf"
+
+define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) { ; 100-iteration loop copying one i32 from %src to %dst; neither address depends on the counter
+; CHECK-LABEL: define void @vf_will_not_generate_any_vector_insts(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope [[META3]], !noalias [[META0]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[TMP3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-NEXT: store i32 [[DOTPRE]], ptr [[DST]], align 4
+; CHECK-NEXT: [[TMP3]] = add nuw i64 [[TMP2]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[TMP3]], 100
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %0 = phi i64 [ 0, %entry ], [ %1, %loop ] ; iteration counter, used only by the exit test
+ %.pre = load i32, ptr %src, align 4 ; same address on every iteration
+ store i32 %.pre, ptr %dst, align 4 ; same address on every iteration; %src/%dst are not noalias, hence the runtime overlap test in the expected output
+ %1 = add nuw i64 %0, 1
+ %ec = icmp eq i64 %1, 100 ; tests the incremented counter, so the body runs exactly 100 times
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; CHECK: [[META3]] = !{[[META4:![0-9]+]]}
+; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
+; CHECK: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/induction-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/induction-branch-cost.ll
new file mode 100644
index 0000000000000..785af1551dd28
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/induction-branch-cost.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-f128:64-n32:64-S128-ni:1:10:20"
+target triple = "wasm32-unknown-emscripten"
+
+define void @induction_phi_and_branch_cost(ptr %end, ptr %start.1, ptr %start.2) #0 { ; two pointer inductions stepping down 4 bytes per iteration; zeroes an i32 through the second one until the first reaches %end
+; CHECK-LABEL: define void @induction_phi_and_branch_cost(
+; CHECK-SAME: ptr [[END:%.*]], ptr [[START_1:%.*]], ptr [[START_2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[END2:%.*]] = ptrtoint ptr [[END]] to i32
+; CHECK-NEXT: [[START_11:%.*]] = ptrtoint ptr [[START_1]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[START_11]], [[END2]]
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[N_VEC]], -4
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START_1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[N_VEC]], -4
+; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, ptr [[START_2]], i32 [[TMP4]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], -4
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_2]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 -3
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START_1]], %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[IND_END3]], %[[MIDDLE_BLOCK]] ], [ [[START_2]], %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ], [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr nusw i8, ptr [[PTR_IV]], i32 -4
+; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr i8, ptr [[PTR_IV_2]], i32 -4
+; CHECK-NEXT: store i32 0, ptr [[PTR_IV_2]], align 4
+; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %ptr.iv = phi ptr [ %start.1, %entry ], [ %ptr.iv.next, %loop ] ; feeds the exit test only
+ %ptr.iv.2 = phi ptr [ %start.2, %entry ], [ %ptr.iv.2.next, %loop ] ; address that is stored to
+ %ptr.iv.next = getelementptr nusw i8, ptr %ptr.iv, i32 -4 ; step down 4 bytes; carries nusw, unlike the second induction
+ %ptr.iv.2.next = getelementptr i8, ptr %ptr.iv.2, i32 -4 ; step down 4 bytes
+ store i32 0, ptr %ptr.iv.2, align 4 ; stores through the pre-step pointer
+ %ec = icmp eq ptr %ptr.iv, %end ; also pre-step: the iteration in which %ptr.iv equals %end still performs its store
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-features"="+simd128" } ; attached to @induction_phi_and_branch_cost
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/WebAssembly/lit.local.cfg
new file mode 100644
index 0000000000000..d5f39ab4dbc8c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "WebAssembly" in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index b067766b54357..eedabde5fc1a1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -479,4 +479,388 @@ exit:
ret i1 %any.of.next
}
+define i64 @avx512_cond_load_cost(ptr %src, i32 %a, i64 %b, i32 %c, i32 %d) #1 { ; loop whose guarded block holds urem/udiv and a struct-field load; the expected output below contains no vector blocks
+; CHECK-LABEL: @avx512_cond_load_cost(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: loop.header:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: [[C_1:%.*]] = icmp slt i32 [[IV]], 0
+; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; CHECK: if.then:
+; CHECK-NEXT: [[TMP0:%.*]] = urem i32 [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT: [[MUL:%.*]] = sub i32 0, [[TMP0]]
+; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[C]], [[D:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[DIV]], [[MUL]]
+; CHECK-NEXT: [[EXT:%.*]] = sext i32 [[OR]] to i64
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr { i64, i64, i64 }, ptr [[SRC:%.*]], i64 [[EXT]], i32 2
+; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT: [[OR_2:%.*]] = or i64 [[L]], [[B:%.*]]
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[OR_2]], [[IF_THEN]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp ult i32 [[IV]], [[C]]
+; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i64 [ [[RES]], [[LOOP_LATCH]] ]
+; CHECK-NEXT: ret i64 [[RES_LCSSA]]
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %c.1 = icmp slt i32 %iv, 0 ; guards if.then
+ br i1 %c.1, label %if.then, label %loop.latch
+
+if.then:
+ %1 = urem i32 %a, %c ; both operands are function arguments
+ %mul = sub i32 0, %1 ; a negation, despite the name
+ %div = udiv i32 %c, %d ; both operands are function arguments
+ %or = or i32 %div, %mul
+ %ext = sext i32 %or to i64
+ %gep = getelementptr { i64, i64, i64 }, ptr %src, i64 %ext, i32 2 ; third i64 field of element %ext
+ %l = load i64, ptr %gep, align 8
+ %or.2 = or i64 %l, %b
+ br label %loop.latch
+
+loop.latch:
+ %res = phi i64 [ 0, %loop.header ], [ %or.2, %if.then ] ; 0 when if.then was skipped
+ %iv.next = add i32 %iv, 1
+ %ec = icmp ult i32 %iv, %c ; unsigned test on the pre-increment %iv; true means keep looping
+ br i1 %ec, label %loop.header, label %exit
+
+exit:
+ ret i64 %res ; %res is defined inside the loop; the expected output routes it through an exit phi
+}
+
+define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; loads the double at element index 4*iv of %A and stores 0.0 back to that slot when the loaded value equals 0.0
+; CHECK-LABEL: @cost_duplicate_recipe_for_sinking(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 16, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE36:%.*]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP3]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP4]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP5]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = shl nsw i64 [[TMP6]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP15]], align 8
+; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP16]], align 8
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP17]], align 8
+; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP18]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x double> [[WIDE_VEC1]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x double> [[WIDE_VEC2]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x double> [[WIDE_VEC3]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[TMP19:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC4]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC5]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC6]], zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP19]], i32 0
+; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP24:%.*]] = shl nsw i64 [[TMP3]], 2
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP24]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP25]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP19]], i32 1
+; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; CHECK: pred.store.if7:
+; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP28:%.*]] = shl nsw i64 [[TMP27]], 2
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP28]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP29]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; CHECK: pred.store.continue8:
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP19]], i32 2
+; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK: pred.store.if9:
+; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP32:%.*]] = shl nsw i64 [[TMP31]], 2
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP32]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP33]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; CHECK: pred.store.continue10:
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP19]], i32 3
+; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK: pred.store.if11:
+; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP36:%.*]] = shl nsw i64 [[TMP35]], 2
+; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP36]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP37]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]]
+; CHECK: pred.store.continue12:
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
+; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; CHECK: pred.store.if13:
+; CHECK-NEXT: [[TMP39:%.*]] = shl nsw i64 [[TMP4]], 2
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP39]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP40]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]]
+; CHECK: pred.store.continue14:
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1
+; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
+; CHECK: pred.store.if15:
+; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP43:%.*]] = shl nsw i64 [[TMP42]], 2
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP43]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP44]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]]
+; CHECK: pred.store.continue16:
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2
+; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; CHECK: pred.store.if17:
+; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP47:%.*]] = shl nsw i64 [[TMP46]], 2
+; CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP47]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP48]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]]
+; CHECK: pred.store.continue18:
+; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3
+; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; CHECK: pred.store.if19:
+; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP51:%.*]] = shl nsw i64 [[TMP50]], 2
+; CHECK-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP51]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP52]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]]
+; CHECK: pred.store.continue20:
+; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i1> [[TMP21]], i32 0
+; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; CHECK: pred.store.if21:
+; CHECK-NEXT: [[TMP54:%.*]] = shl nsw i64 [[TMP5]], 2
+; CHECK-NEXT: [[TMP55:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP54]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP55]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]]
+; CHECK: pred.store.continue22:
+; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP21]], i32 1
+; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; CHECK: pred.store.if23:
+; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 9
+; CHECK-NEXT: [[TMP58:%.*]] = shl nsw i64 [[TMP57]], 2
+; CHECK-NEXT: [[TMP59:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP58]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP59]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]]
+; CHECK: pred.store.continue24:
+; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP21]], i32 2
+; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; CHECK: pred.store.if25:
+; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 10
+; CHECK-NEXT: [[TMP62:%.*]] = shl nsw i64 [[TMP61]], 2
+; CHECK-NEXT: [[TMP63:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP62]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP63]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]]
+; CHECK: pred.store.continue26:
+; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP21]], i32 3
+; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; CHECK: pred.store.if27:
+; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 11
+; CHECK-NEXT: [[TMP66:%.*]] = shl nsw i64 [[TMP65]], 2
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP66]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP67]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]]
+; CHECK: pred.store.continue28:
+; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP22]], i32 0
+; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
+; CHECK: pred.store.if29:
+; CHECK-NEXT: [[TMP69:%.*]] = shl nsw i64 [[TMP6]], 2
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP69]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP70]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]]
+; CHECK: pred.store.continue30:
+; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP22]], i32 1
+; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]]
+; CHECK: pred.store.if31:
+; CHECK-NEXT: [[TMP72:%.*]] = add i64 [[INDEX]], 13
+; CHECK-NEXT: [[TMP73:%.*]] = shl nsw i64 [[TMP72]], 2
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP73]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP74]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32]]
+; CHECK: pred.store.continue32:
+; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP22]], i32 2
+; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]]
+; CHECK: pred.store.if33:
+; CHECK-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], 14
+; CHECK-NEXT: [[TMP77:%.*]] = shl nsw i64 [[TMP76]], 2
+; CHECK-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP77]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP78]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE34]]
+; CHECK: pred.store.continue34:
+; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP22]], i32 3
+; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36]]
+; CHECK: pred.store.if35:
+; CHECK-NEXT: [[TMP80:%.*]] = add i64 [[INDEX]], 15
+; CHECK-NEXT: [[TMP81:%.*]] = shl nsw i64 [[TMP80]], 2
+; CHECK-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP81]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP82]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE36]]
+; CHECK: pred.store.continue36:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP83:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: loop.header:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: [[IV_SHL:%.*]] = shl nsw i64 [[IV]], 2
+; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[IV_SHL]]
+; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_0]], align 8
+; CHECK-NEXT: [[C:%.*]] = fcmp oeq double [[L]], 0.000000e+00
+; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; CHECK: if.then:
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr double, ptr [[A]], i64 [[IV_SHL]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_1]], align 8
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %iv.shl = shl nsw i64 %iv, 2 ; element index = 4 * %iv
+ %gep.0 = getelementptr nusw double, ptr %A, i64 %iv.shl
+ %l = load double, ptr %gep.0, align 8
+ %c = fcmp oeq double %l, 0.000000e+00 ; guards the store in if.then
+ br i1 %c, label %if.then, label %loop.latch
+
+if.then:
+ %gep.1 = getelementptr double, ptr %A, i64 %iv.shl ; same base and index as %gep.0, recomputed without nusw
+ store double 0.000000e+00, ptr %gep.1, align 8
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv, %N ; compares the pre-increment %iv; the expected output computes the trip count as %N + 1
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+define i64 @cost_assume(ptr %end, i64 %N) { ; counts loop iterations, calling llvm.assume on a loop-invariant condition in each one; returns the count
+; CHECK-LABEL: @cost_assume(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -9
+; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], 9
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[N:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3]] = add <2 x i64> [[VEC_PHI]], <i64 1, i64 1>
+; CHECK-NEXT: [[TMP4]] = add <2 x i64> [[VEC_PHI2]], <i64 1, i64 1>
+; CHECK-NEXT: [[TMP5]] = add <2 x i64> [[VEC_PHI3]], <i64 1, i64 1>
+; CHECK-NEXT: [[TMP6]] = add <2 x i64> [[VEC_PHI4]], <i64 1, i64 1>
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP12]])
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]])
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP14]])
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP15]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP16]])
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP17]])
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP18]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
+; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <2 x i64> [[TMP5]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <2 x i64> [[TMP6]], [[BIN_RDX5]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX6]])
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP22:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP22]] = add i64 [[TMP21]], 1
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[N]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[C]])
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nusw [9 x i8], ptr null, i64 [[IV_NEXT]]
+; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[GEP]], [[END]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP22]], [[LOOP]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i64 [[DOTLCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %0 = phi i64 [ 0, %entry ], [ %1, %loop ] ; running count; its incremented value %1 is what the function returns
+ %1 = add i64 %0, 1
+ %iv.next = add nsw i64 %iv, 1
+ %c = icmp ne i64 %N, 0 ; depends only on the argument %N, so it is the same on every iteration
+ tail call void @llvm.assume(i1 %c)
+ %gep = getelementptr nusw [ 9 x i8 ], ptr null, i64 %iv.next ; null base indexed in 9-byte elements by the incremented induction
+ %ec = icmp eq ptr %gep, %end ; leave the loop once that address equals %end
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i64 %1
+}
+
+declare void @llvm.assume(i1 noundef) #0
+
attributes #0 = { "target-cpu"="penryn" }
+attributes #1 = { "target-features"="+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl" } ; attached to @avx512_cond_load_cost
+attributes #2 = { "target-cpu"="znver3" } ; attached to @cost_duplicate_recipe_for_sinking
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
index ce6dd52d54557..3226f72d51d2e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -45,7 +45,7 @@ define i32 @foo_optsize() #0 {
; CHECK-NEXT: store i8 [[DOT]], ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret i32 0
;
@@ -84,7 +84,7 @@ define i32 @foo_optsize() #0 {
; AUTOVF-NEXT: store i8 [[DOT]], ptr [[ARRAYIDX]], align 1
; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; AUTOVF: for.end:
; AUTOVF-NEXT: ret i32 0
;
@@ -353,3 +353,112 @@ for.cond:
while.cond.loopexit:
ret i32 0
}
+
+define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 { ; Stores null through a pointer IV that steps down from %start by 72 bytes until the next pointer equals %end.
+; CHECK-LABEL: @tail_folded_store_avx512(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[END2:%.*]] = ptrtoint ptr [[END:%.*]] to i32
+; CHECK-NEXT: [[START1:%.*]] = ptrtoint ptr [[START:%.*]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[START1]], -72
+; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[END2]]
+; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 72
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP3]], 63
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 64
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[N_VEC]], -72
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i32 [[TMP4]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <64 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT3]], <64 x i32> poison, <64 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <64 x i32> <i32 0, i32 -72, i32 -144, i32 -216, i32 -288, i32 -360, i32 -432, i32 -504, i32 -576, i32 -648, i32 -720, i32 -792, i32 -864, i32 -936, i32 -1008, i32 -1080, i32 -1152, i32 -1224, i32 -1296, i32 -1368, i32 -1440, i32 -1512, i32 -1584, i32 -1656, i32 -1728, i32 -1800, i32 -1872, i32 -1944, i32 -2016, i32 -2088, i32 -2160, i32 -2232, i32 -2304, i32 -2376, i32 -2448, i32 -2520, i32 -2592, i32 -2664, i32 -2736, i32 -2808, i32 -2880, i32 -2952, i32 -3024, i32 -3096, i32 -3168, i32 -3240, i32 -3312, i32 -3384, i32 -3456, i32 -3528, i32 -3600, i32 -3672, i32 -3744, i32 -3816, i32 -3888, i32 -3960, i32 -4032, i32 -4104, i32 -4176, i32 -4248, i32 -4320, i32 -4392, i32 -4464, i32 -4536>
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> poison, i32 [[INDEX]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> poison, <64 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <64 x i32> [[VEC_IV]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v64p0.v64p0(<64 x ptr> zeroinitializer, <64 x ptr> [[TMP5]], i32 8, <64 x i1> [[TMP6]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 -4608
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr nusw i8, ptr [[PTR_IV]], i64 -72
+; CHECK-NEXT: store ptr null, ptr [[PTR_IV]], align 8
+; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+; AUTOVF-LABEL: @tail_folded_store_avx512(
+; AUTOVF-NEXT: entry:
+; AUTOVF-NEXT: [[END2:%.*]] = ptrtoint ptr [[END:%.*]] to i32
+; AUTOVF-NEXT: [[START1:%.*]] = ptrtoint ptr [[START:%.*]] to i32
+; AUTOVF-NEXT: [[TMP0:%.*]] = add i32 [[START1]], -72
+; AUTOVF-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[END2]]
+; AUTOVF-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 72
+; AUTOVF-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1
+; AUTOVF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AUTOVF: vector.ph:
+; AUTOVF-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP3]], 7
+; AUTOVF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
+; AUTOVF-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; AUTOVF-NEXT: [[TMP4:%.*]] = mul i32 [[N_VEC]], -72
+; AUTOVF-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i32 [[TMP4]]
+; AUTOVF-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
+; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
+; AUTOVF-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT3]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AUTOVF-NEXT: br label [[VECTOR_BODY:%.*]]
+; AUTOVF: vector.body:
+; AUTOVF-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; AUTOVF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AUTOVF-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <8 x i32> <i32 0, i32 -72, i32 -144, i32 -216, i32 -288, i32 -360, i32 -432, i32 -504>
+; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[INDEX]], i64 0
+; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AUTOVF-NEXT: [[TMP6:%.*]] = icmp ule <8 x i32> [[VEC_IV]], [[BROADCAST_SPLAT4]]
+; AUTOVF-NEXT: call void @llvm.masked.scatter.v8p0.v8p0(<8 x ptr> zeroinitializer, <8 x ptr> [[TMP5]], i32 8, <8 x i1> [[TMP6]])
+; AUTOVF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; AUTOVF-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 -576
+; AUTOVF-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; AUTOVF-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; AUTOVF: middle.block:
+; AUTOVF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; AUTOVF: scalar.ph:
+; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
+; AUTOVF-NEXT: br label [[LOOP:%.*]]
+; AUTOVF: loop:
+; AUTOVF-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ]
+; AUTOVF-NEXT: [[PTR_IV_NEXT]] = getelementptr nusw i8, ptr [[PTR_IV]], i64 -72
+; AUTOVF-NEXT: store ptr null, ptr [[PTR_IV]], align 8
+; AUTOVF-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; AUTOVF-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; AUTOVF: exit:
+; AUTOVF-NEXT: ret void
+;
+entry: ; The check lines above expect a masked-scatter vector loop: 64 lanes under CHECK, 8 lanes under AUTOVF.
+ br label %loop
+
+loop:
+ %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop ] ; pointer induction variable, seeded with %start
+ %ptr.iv.next = getelementptr nusw i8, ptr %ptr.iv, i64 -72 ; step: -72 bytes per iteration
+ store ptr null, ptr %ptr.iv, align 8 ; the only memory access: an 8-byte-aligned store of null at the current pointer
+ %ec = icmp eq ptr %ptr.iv.next, %end ; the loop ends when the stepped pointer equals %end
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+attributes #3 = { optsize "target-cpu"="skylake-avx512" } ; attribute group referenced by @tail_folded_store_avx512
More information about the llvm-commits
mailing list