[llvm] 872f700 - Revert "[NFCI] Regenerate SROA/LoopVectorize test checks"

Dávid Bolvanský via llvm-commits <llvm-commits at lists.llvm.org>
Sun Apr 3 16:16:31 PDT 2022


Author: Dávid Bolvanský
Date: 2022-04-04T01:15:30+02:00
New Revision: 872f7000fc6a2945c715dd701984560b0a98105c

URL: https://github.com/llvm/llvm-project/commit/872f7000fc6a2945c715dd701984560b0a98105c
DIFF: https://github.com/llvm/llvm-project/commit/872f7000fc6a2945c715dd701984560b0a98105c.diff

LOG: Revert "[NFCI] Regenerate SROA/LoopVectorize test checks"

This reverts commit 14e3450fb57305aa9ff3e9e60687b458e43835c9.
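
For context, the check lines being removed in the hunks below were autogenerated
with utils/update_test_checks.py (hence the dropped "NOTE: Assertions have been
autogenerated" headers). A minimal sketch of how such checks are typically
regenerated, assuming a locally built opt; the exact invocation used in the
now-reverted commit is not recorded in this message:

  # Hypothetical example; point --opt-binary at your own build of opt.
  llvm/utils/update_test_checks.py --opt-binary=build/bin/opt \
      llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll

The revert restores the earlier hand-written CHECK patterns instead.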

Added: 
    

Modified: 
    llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
    llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
    llvm/test/Transforms/LoopVectorize/AArch64/Oz-and-forced-vectorize.ll
    llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll
    llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll
    llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
    llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
    llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
    llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll
    llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll
    llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
    llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
    llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll
    llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
    llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll
    llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
    llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll
    llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-type-conv.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
    llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
    llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
    llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
    llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
    llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
    llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll
    llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
    llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll
    llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
    llvm/test/Transforms/LoopVectorize/PowerPC/massv-altivec.ll
    llvm/test/Transforms/LoopVectorize/PowerPC/massv-calls.ll
    llvm/test/Transforms/LoopVectorize/PowerPC/massv-nobuiltin.ll
    llvm/test/Transforms/LoopVectorize/PowerPC/massv-unsupported.ll
    llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
    llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll
    llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
    llvm/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll
    llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll
    llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
    llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
    llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
    llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
    llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
    llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll
    llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
    llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
    llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
    llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
    llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
    llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll
    llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
    llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
    llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
    llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
    llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll
    llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
    llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
    llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
    llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll
    llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-finite.ll
    llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls.ll
    llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
    llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
    llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
    llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll
    llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
    llvm/test/Transforms/LoopVectorize/X86/optsize.ll
    llvm/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
    llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
    llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
    llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
    llvm/test/Transforms/LoopVectorize/X86/pr36524.ll
    llvm/test/Transforms/LoopVectorize/X86/pr39160.ll
    llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
    llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
    llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
    llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll
    llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
    llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
    llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
    llvm/test/Transforms/LoopVectorize/X86/register-assumption.ll
    llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
    llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
    llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
    llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
    llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
    llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
    llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
    llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
    llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
    llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll
    llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
    llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
    llvm/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll
    llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll
    llvm/test/Transforms/LoopVectorize/alias-set-with-uncomputable-bounds.ll
    llvm/test/Transforms/LoopVectorize/assume.ll
    llvm/test/Transforms/LoopVectorize/check-prof-info.ll
    llvm/test/Transforms/LoopVectorize/conditional-assignment.ll
    llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
    llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
    llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
    llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
    llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
    llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll
    llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
    llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
    llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
    llvm/test/Transforms/LoopVectorize/float-induction.ll
    llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
    llvm/test/Transforms/LoopVectorize/fneg.ll
    llvm/test/Transforms/LoopVectorize/i8-induction.ll
    llvm/test/Transforms/LoopVectorize/if-conv-crash.ll
    llvm/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll
    llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
    llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll
    llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
    llvm/test/Transforms/LoopVectorize/induction-ptrcasts.ll
    llvm/test/Transforms/LoopVectorize/induction.ll
    llvm/test/Transforms/LoopVectorize/induction_plus.ll
    llvm/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll
    llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
    llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
    llvm/test/Transforms/LoopVectorize/lcssa-crash.ll
    llvm/test/Transforms/LoopVectorize/loop-form.ll
    llvm/test/Transforms/LoopVectorize/loop-vect-memdep.ll
    llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
    llvm/test/Transforms/LoopVectorize/multiple-exits-versioning.ll
    llvm/test/Transforms/LoopVectorize/no-interleave-up-front.ll
    llvm/test/Transforms/LoopVectorize/no_array_bounds.ll
    llvm/test/Transforms/LoopVectorize/no_idiv_reduction.ll
    llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll
    llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll
    llvm/test/Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll
    llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
    llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
    llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
    llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll
    llvm/test/Transforms/LoopVectorize/pointer-induction.ll
    llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
    llvm/test/Transforms/LoopVectorize/pr30806-phi-scev.ll
    llvm/test/Transforms/LoopVectorize/pr30806.ll
    llvm/test/Transforms/LoopVectorize/pr35743.ll
    llvm/test/Transforms/LoopVectorize/pr35773.ll
    llvm/test/Transforms/LoopVectorize/pr37515.ll
    llvm/test/Transforms/LoopVectorize/pr38697.ll
    llvm/test/Transforms/LoopVectorize/pr38800.ll
    llvm/test/Transforms/LoopVectorize/pr39099.ll
    llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
    llvm/test/Transforms/LoopVectorize/pr45259.ll
    llvm/test/Transforms/LoopVectorize/pr45525.ll
    llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
    llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
    llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll
    llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll
    llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
    llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
    llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll
    llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
    llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
    llvm/test/Transforms/LoopVectorize/runtime-check-pointer-element-type.ll
    llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
    llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll
    llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
    llvm/test/Transforms/LoopVectorize/scalable-assume.ll
    llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll
    llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
    llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
    llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
    llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll
    llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
    llvm/test/Transforms/LoopVectorize/select-reduction.ll
    llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
    llvm/test/Transforms/LoopVectorize/store-shuffle-bug.ll
    llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
    llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
    llvm/test/Transforms/LoopVectorize/tripcount.ll
    llvm/test/Transforms/LoopVectorize/trunc-reductions.ll
    llvm/test/Transforms/LoopVectorize/uniform-blend.ll
    llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
    llvm/test/Transforms/LoopVectorize/vectorize-pointer-phis.ll
    llvm/test/Transforms/LoopVectorize/vectorizeVFone.ll
    llvm/test/Transforms/LoopVectorize/vplan-outer-loop-uncomputable-trip-count.ll
    llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll
    llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll
    llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll
    llvm/test/Transforms/SROA/address-spaces.ll
    llvm/test/Transforms/SROA/alignment.ll
    llvm/test/Transforms/SROA/big-endian.ll
    llvm/test/Transforms/SROA/dbg-inline.ll
    llvm/test/Transforms/SROA/dbg-single-piece.ll
    llvm/test/Transforms/SROA/dead-inst.ll
    llvm/test/Transforms/SROA/fca.ll
    llvm/test/Transforms/SROA/preserve-nonnull.ll
    llvm/test/Transforms/SROA/slice-order-independence.ll
    llvm/test/Transforms/SROA/vector-conversion.ll
    llvm/test/Transforms/SROA/vector-promotion-different-size.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
index 66edc6b42206f..b3eae690423d9 100644
--- a/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
@@ -1,30 +1,29 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce
 
 ; Check that we don't fall into an infinite loop.
 define void @test() nounwind {
 entry:
-  br label %for.body
+ br label %for.body
 
 for.body:
-  %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
-  br label %for.body
+ %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
+ br label %for.body
 }
 
 
 
 define void @test2() nounwind {
 entry:
-  br label %for.body
+ br label %for.body
 
 for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv47 = phi i64 [ 0, %entry ], [ %indvars.iv.next48, %for.body ]
-  %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
-  %indvars.iv.next48 = add i64 %indvars.iv47, 1
-  br i1 undef, label %for.end, label %for.body
+ %indvars.iv47 = phi i64 [ 0, %entry ], [ %indvars.iv.next48, %for.body ]
+ %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
+ %indvars.iv.next48 = add i64 %indvars.iv47, 1
+ br i1 undef, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body
-  unreachable
+ unreachable
 }
 
 ;PR14701

diff --git a/llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll b/llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
index ef102dba61db4..baf96b84a34f5 100644
--- a/llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
+++ b/llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
@@ -1,5 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s  -loop-vectorize -dce -force-vector-interleave=1 -force-vector-width=4
+; RUN: opt < %s  -loop-vectorize -dce -force-vector-interleave=1 -force-vector-width=4 
 
 ; Check that we don't crash.
 

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/Oz-and-forced-vectorize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/Oz-and-forced-vectorize.ll
index ea45ebf7f9598..9dff3dffff2bb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/Oz-and-forced-vectorize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/Oz-and-forced-vectorize.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -Oz -S -enable-new-pm=0  < %s | FileCheck %s
 ; RUN: opt -passes='default<Oz>' -S < %s | FileCheck %s
 
@@ -11,57 +10,7 @@ target triple = "arm64-apple-ios5.0.0"
 
 define void @foo(float* noalias nocapture %ptrA, float* noalias nocapture readonly %ptrB, i64 %size) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[EXITCOND1:%.*]] = icmp eq i64 [[SIZE:%.*]], 0
-; CHECK-NEXT:    br i1 [[EXITCOND1]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SIZE]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER6:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SIZE]], -8
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[PTRB:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 4
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[PTRA:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP4]], i64 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fmul <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[SIZE]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER6]]
-; CHECK:       for.body.preheader6:
-; CHECK-NEXT:    [[INDVARS_IV2_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV2_PH]], [[FOR_BODY_PREHEADER6]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTRB]], i64 [[INDVARS_IV2]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[PTRA]], i64 [[INDVARS_IV2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    store float [[MUL3]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV2]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[SIZE]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
+; CHECK: fmul <4 x float>
 ;
 entry:
   br label %for.cond

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll
index 8354251a46d57..a689f44e912c2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll
@@ -1,80 +1,18 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -mtriple=aarch64-none-linux-gnu -mattr=+neon -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
 ; Function Attrs: nounwind
 define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
-; CHECK-LABEL: @array_add(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[SIZE]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 4
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 4
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4
-; CHECK-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP17]], <4 x i32>* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 4
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP18]], <4 x i32>* [[TMP24]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32* [[C]]
-;
+;CHECK-LABEL: array_add
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
 entry:
   %cmp10 = icmp sgt i32 %size, 0
   br i1 %cmp10, label %for.body.preheader, label %for.end

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll
index 8d6ccc920c628..395b468c509cc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll
@@ -1,80 +1,18 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -mtriple=arm64-none-linux-gnu -mattr=+neon -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
 ; Function Attrs: nounwind
 define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
-; CHECK-LABEL: @array_add(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[SIZE]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 4
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 4
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4
-; CHECK-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP17]], <4 x i32>* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 4
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP18]], <4 x i32>* [[TMP24]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32* [[C]]
-;
+;CHECK-LABEL: array_add
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
 entry:
   %cmp10 = icmp sgt i32 %size, 0
   br i1 %cmp10, label %for.body.preheader, label %for.end

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
index ff80332063550..3eb0eef96f9f9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-target-instruction-cost=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s 2>&1 | FileCheck %s
 
 ; This test currently fails when the LV calculates a maximums safe
@@ -16,47 +15,6 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
 
 define void @f1(i32* %A) #0 {
-; CHECK-LABEL: @f1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 1024)
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    store i32 1, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body
 

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
index aeb70b933c330..371d209bafffe 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -mtriple=arm64-apple-darwin -S %s | FileCheck %s
 
 ; Test cases for extending the vectorization factor, if small memory operations
@@ -9,63 +8,7 @@
 ; load 4 x i8, vectorization might still be profitable.
 define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
 ; CHECK-LABEL: @test_load_i8_store_i32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[OFF:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[OFF]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP8]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[TMP9]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP15]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 4
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load i8, i8* [[GEP_SRC]], align 1
-; CHECK-NEXT:    [[LV_EXT:%.*]] = zext i8 [[LV]] to i32
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LV_EXT]], [[OFF]]
-; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[GEP_DST]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK:       <4 x i8>
 ;
 entry:
   br label %loop
@@ -89,63 +32,7 @@ exit:
 ; Same as test_load_i8_store_i32, but with types flipped for load and store.
 define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
 ; CHECK-LABEL: @test_load_i32_store_i8(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[OFF:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[OFF]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i8>
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc <4 x i32> [[TMP9]] to <4 x i8>
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP10]], <4 x i8>* [[TMP15]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 4
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP11]], <4 x i8>* [[TMP17]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[GEP_SRC]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LV]], [[OFF]]
-; CHECK-NEXT:    [[ADD_TRUNC:%.*]] = trunc i32 [[ADD]] to i8
-; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[IV]]
-; CHECK-NEXT:    store i8 [[ADD_TRUNC]], i8* [[GEP_DST]], align 1
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK:     <4 x i8>
 ;
 entry:
   br label %loop
@@ -169,66 +56,8 @@ exit:
 ; All memory operations use i32, all memory operations are profitable with VF 4.
 define void @test_load_i32_store_i32(i32* noalias %src, i32* noalias %dst, i8 %off, i64 %N) {
 ; CHECK-LABEL: @test_load_i32_store_i32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[OFF:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i8> poison, i8 [[OFF]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT2]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i32> [[WIDE_LOAD]] to <4 x i8>
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i32> [[WIDE_LOAD1]] to <4 x i8>
-; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i8> [[TMP8]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i8> [[TMP9]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32>
-; CHECK-NEXT:    [[TMP13:%.*]] = zext <4 x i8> [[TMP11]] to <4 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 4
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[TMP19]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[GEP_SRC]], align 1
-; CHECK-NEXT:    [[LV_TRUNC:%.*]] = trunc i32 [[LV]] to i8
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LV_TRUNC]], [[OFF]]
-; CHECK-NEXT:    [[ADD_EXT:%.*]] = zext i8 [[ADD]] to i32
-; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[ADD_EXT]], i32* [[GEP_DST]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK: vector.body:
+; CHECK:   <4 x i32>
 ;
 entry:
   br label %loop
@@ -254,91 +83,8 @@ exit:
 ; vectorization factor is large. Make sure the register estimates limit the
 ; vectorization factor.
 define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) {
-; CHECK-LABEL: @test_load_i8_store_i64_large(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[OFF:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[OFF_2:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[DST_3:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[TMP3]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[DST_5:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[TMP3]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = zext <2 x i8> [[WIDE_LOAD2]] to <2 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[TMP9]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[BROADCAST_SPLAT4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[DST_2:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add <2 x i64> [[TMP11]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add <2 x i64> [[TMP14]], [[TMP10]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[DST_4:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = add <2 x i64> [[TMP11]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, i64* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64* [[TMP18]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[TMP19]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i64* [[TMP20]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* [[TMP21]], align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i64* [[TMP3]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[TMP24]], align 4
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, i64* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP26:%.*]] = bitcast i64* [[TMP25]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP17]], <2 x i64>* [[TMP26]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[GEP_DST_3:%.*]] = getelementptr inbounds i64, i64* [[DST_3]], i64 [[IV]]
-; CHECK-NEXT:    [[LV_DST_3:%.*]] = load i64, i64* [[GEP_DST_3]], align 1
-; CHECK-NEXT:    [[GEP_DST_5:%.*]] = getelementptr inbounds i64, i64* [[DST_5]], i64 [[IV]]
-; CHECK-NEXT:    [[LV_DST_5:%.*]] = load i64, i64* [[GEP_DST_3]], align 1
-; CHECK-NEXT:    [[LV:%.*]] = load i8, i8* [[GEP_SRC]], align 1
-; CHECK-NEXT:    [[LV_EXT:%.*]] = zext i8 [[LV]] to i64
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[LV_EXT]], [[OFF]]
-; CHECK-NEXT:    [[ADD_2:%.*]] = add i64 [[ADD]], [[OFF_2]]
-; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i64, i64* [[DST]], i64 [[IV]]
-; CHECK-NEXT:    [[GEP_DST_2:%.*]] = getelementptr inbounds i64, i64* [[DST_2]], i64 [[IV]]
-; CHECK-NEXT:    [[ADD_3:%.*]] = add i64 [[ADD_2]], [[LV_DST_3]]
-; CHECK-NEXT:    [[ADD_4:%.*]] = add i64 [[ADD_3]], [[ADD]]
-; CHECK-NEXT:    [[GEP_DST_4:%.*]] = getelementptr inbounds i64, i64* [[DST_4]], i64 [[IV]]
-; CHECK-NEXT:    [[ADD_5:%.*]] = add i64 [[ADD_2]], [[LV_DST_5]]
-; CHECK-NEXT:    store i64 [[ADD_2]], i64* [[GEP_DST_2]], align 4
-; CHECK-NEXT:    store i64 [[ADD]], i64* [[GEP_DST]], align 4
-; CHECK-NEXT:    store i64 [[ADD_3]], i64* [[GEP_DST_3]], align 4
-; CHECK-NEXT:    store i64 [[ADD_4]], i64* [[GEP_DST_4]], align 4
-; CHECK-NEXT:    store i64 [[ADD_5]], i64* [[GEP_DST_5]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: @test_load_i8_store_i64_large
+; CHECK: <2 x i64>
 ;
 entry:
   br label %loop

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
index 6481bb6031feb..6e28994cacdca 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=CHECK-VF4UF1
 ; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=CHECK-VF4UF2
 
@@ -20,194 +19,13 @@
 ; }
 ;
 define i32 @PR33613(double* %b, double %j, i32 %d) #0 {
-; CHECK-VF4UF1-LABEL: @PR33613(
-; CHECK-VF4UF1-NEXT:  entry:
-; CHECK-VF4UF1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[D:%.*]] to i64
-; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 10240, [[TMP1]]
-; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4UF1:       vector.ph:
-; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 10240, [[TMP3]]
-; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 10240, [[N_MOD_VF]]
-; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 25
-; CHECK-VF4UF1-NEXT:    [[IND_END:%.*]] = getelementptr double, double* [[B:%.*]], i64 [[TMP4]]
-; CHECK-VF4UF1-NEXT:    [[IND_END2:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP6]], 1
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x double> poison, double [[J:%.*]], i32 [[TMP7]]
-; CHECK-VF4UF1-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4UF1:       vector.body:
-; CHECK-VF4UF1-NEXT:    [[POINTER_PHI:%.*]] = phi double* [ [[B]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 0, i32 0), [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x double> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_MASKED_GATHER:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 1
-; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = mul i64 25, [[TMP10]]
-; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP9]], 0
-; CHECK-VF4UF1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i32 0
-; CHECK-VF4UF1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT]], [[TMP13]]
-; CHECK-VF4UF1-NEXT:    [[VECTOR_GEP:%.*]] = mul <vscale x 4 x i64> [[TMP14]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 25, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = getelementptr double, double* [[POINTER_PHI]], <vscale x 4 x i64> [[VECTOR_GEP]]
-; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, <vscale x 4 x double*> [[TMP15]], i64 [[IDXPROM]]
-; CHECK-VF4UF1-NEXT:    [[WIDE_MASKED_GATHER]] = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> [[TMP16]], i32 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x double> undef)
-; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> [[VECTOR_RECUR]], <vscale x 4 x double> [[WIDE_MASKED_GATHER]], i32 -1)
-; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = fmul <vscale x 4 x double> [[TMP17]], [[WIDE_MASKED_GATHER]]
-; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = fcmp une <vscale x 4 x double> [[TMP18]], zeroinitializer
-; CHECK-VF4UF1-NEXT:    [[TMP20:%.*]] = zext <vscale x 4 x i1> [[TMP19]] to <vscale x 4 x i32>
-; CHECK-VF4UF1-NEXT:    [[TMP21]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP20]]
-; CHECK-VF4UF1-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 4
-; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-VF4UF1-NEXT:    [[PTR_IND]] = getelementptr double, double* [[POINTER_PHI]], i64 [[TMP11]]
-; CHECK-VF4UF1-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4UF1-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-VF4UF1:       middle.block:
-; CHECK-VF4UF1-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
-; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10240, [[N_VEC]]
-; CHECK-VF4UF1-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP27]], 1
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x double> [[WIDE_MASKED_GATHER]], i32 [[TMP28]]
-; CHECK-VF4UF1-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP30:%.*]] = mul i32 [[TMP29]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP30]], 2
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x double> [[WIDE_MASKED_GATHER]], i32 [[TMP31]]
-; CHECK-VF4UF1-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4UF1:       scalar.ph:
-; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[J]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi double* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ]
-; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-VF4UF1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF1-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4UF1:       for.cond.cleanup:
-; CHECK-VF4UF1-NEXT:    [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF1-NEXT:    ret i32 [[A_1_LCSSA]]
-; CHECK-VF4UF1:       for.body:
-; CHECK-VF4UF1-NEXT:    [[B_ADDR_012:%.*]] = phi double* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[I_011:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC1:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[A_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_1]], [[FOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP32:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B_ADDR_012]], i64 [[IDXPROM]]
-; CHECK-VF4UF1-NEXT:    [[TMP32]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-VF4UF1-NEXT:    [[MUL:%.*]] = fmul double [[SCALAR_RECUR]], [[TMP32]]
-; CHECK-VF4UF1-NEXT:    [[TOBOOL:%.*]] = fcmp une double [[MUL]], 0.000000e+00
-; CHECK-VF4UF1-NEXT:    [[INC:%.*]] = zext i1 [[TOBOOL]] to i32
-; CHECK-VF4UF1-NEXT:    [[A_1]] = add nsw i32 [[A_010]], [[INC]]
-; CHECK-VF4UF1-NEXT:    [[INC1]] = add nuw nsw i32 [[I_011]], 1
-; CHECK-VF4UF1-NEXT:    [[ADD_PTR]] = getelementptr inbounds double, double* [[B_ADDR_012]], i64 25
-; CHECK-VF4UF1-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC1]], 10240
-; CHECK-VF4UF1-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
-; CHECK-VF4UF2-LABEL: @PR33613(
-; CHECK-VF4UF2-NEXT:  entry:
-; CHECK-VF4UF2-NEXT:    [[IDXPROM:%.*]] = sext i32 [[D:%.*]] to i64
-; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 10240, [[TMP1]]
-; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4UF2:       vector.ph:
-; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 10240, [[TMP3]]
-; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 10240, [[N_MOD_VF]]
-; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 25
-; CHECK-VF4UF2-NEXT:    [[IND_END:%.*]] = getelementptr double, double* [[B:%.*]], i64 [[TMP4]]
-; CHECK-VF4UF2-NEXT:    [[IND_END2:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP6]], 1
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x double> poison, double [[J:%.*]], i32 [[TMP7]]
-; CHECK-VF4UF2-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4UF2:       vector.body:
-; CHECK-VF4UF2-NEXT:    [[POINTER_PHI:%.*]] = phi double* [ [[B]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 0, i32 0), [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[VEC_PHI6:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x double> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_MASKED_GATHER7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = mul i64 25, [[TMP10]]
-; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP9]], 0
-; CHECK-VF4UF2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i32 0
-; CHECK-VF4UF2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT]], [[TMP13]]
-; CHECK-VF4UF2-NEXT:    [[VECTOR_GEP:%.*]] = mul <vscale x 4 x i64> [[TMP14]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 25, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = getelementptr double, double* [[POINTER_PHI]], <vscale x 4 x i64> [[VECTOR_GEP]]
-; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP9]], 1
-; CHECK-VF4UF2-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP16]], i32 0
-; CHECK-VF4UF2-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT4]], [[TMP17]]
-; CHECK-VF4UF2-NEXT:    [[VECTOR_GEP5:%.*]] = mul <vscale x 4 x i64> [[TMP18]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 25, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = getelementptr double, double* [[POINTER_PHI]], <vscale x 4 x i64> [[VECTOR_GEP5]]
-; CHECK-VF4UF2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double, <vscale x 4 x double*> [[TMP15]], i64 [[IDXPROM]]
-; CHECK-VF4UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, <vscale x 4 x double*> [[TMP19]], i64 [[IDXPROM]]
-; CHECK-VF4UF2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> [[TMP20]], i32 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x double> undef)
-; CHECK-VF4UF2-NEXT:    [[WIDE_MASKED_GATHER7]] = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> [[TMP21]], i32 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x double> undef)
-; CHECK-VF4UF2-NEXT:    [[TMP22:%.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> [[VECTOR_RECUR]], <vscale x 4 x double> [[WIDE_MASKED_GATHER]], i32 -1)
-; CHECK-VF4UF2-NEXT:    [[TMP23:%.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> [[WIDE_MASKED_GATHER]], <vscale x 4 x double> [[WIDE_MASKED_GATHER7]], i32 -1)
-; CHECK-VF4UF2-NEXT:    [[TMP24:%.*]] = fmul <vscale x 4 x double> [[TMP22]], [[WIDE_MASKED_GATHER]]
-; CHECK-VF4UF2-NEXT:    [[TMP25:%.*]] = fmul <vscale x 4 x double> [[TMP23]], [[WIDE_MASKED_GATHER7]]
-; CHECK-VF4UF2-NEXT:    [[TMP26:%.*]] = fcmp une <vscale x 4 x double> [[TMP24]], zeroinitializer
-; CHECK-VF4UF2-NEXT:    [[TMP27:%.*]] = fcmp une <vscale x 4 x double> [[TMP25]], zeroinitializer
-; CHECK-VF4UF2-NEXT:    [[TMP28:%.*]] = zext <vscale x 4 x i1> [[TMP26]] to <vscale x 4 x i32>
-; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = zext <vscale x 4 x i1> [[TMP27]] to <vscale x 4 x i32>
-; CHECK-VF4UF2-NEXT:    [[TMP30]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP28]]
-; CHECK-VF4UF2-NEXT:    [[TMP31]] = add <vscale x 4 x i32> [[VEC_PHI6]], [[TMP29]]
-; CHECK-VF4UF2-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP33:%.*]] = mul i64 [[TMP32]], 8
-; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP33]]
-; CHECK-VF4UF2-NEXT:    [[PTR_IND]] = getelementptr double, double* [[POINTER_PHI]], i64 [[TMP11]]
-; CHECK-VF4UF2-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4UF2-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-VF4UF2:       middle.block:
-; CHECK-VF4UF2-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP31]], [[TMP30]]
-; CHECK-VF4UF2-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10240, [[N_VEC]]
-; CHECK-VF4UF2-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP37:%.*]] = mul i32 [[TMP36]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], 1
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x double> [[WIDE_MASKED_GATHER7]], i32 [[TMP38]]
-; CHECK-VF4UF2-NEXT:    [[TMP39:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP40]], 2
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x double> [[WIDE_MASKED_GATHER7]], i32 [[TMP41]]
-; CHECK-VF4UF2-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4UF2:       scalar.ph:
-; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[J]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi double* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ]
-; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-VF4UF2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF2-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4UF2:       for.cond.cleanup:
-; CHECK-VF4UF2-NEXT:    [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF2-NEXT:    ret i32 [[A_1_LCSSA]]
-; CHECK-VF4UF2:       for.body:
-; CHECK-VF4UF2-NEXT:    [[B_ADDR_012:%.*]] = phi double* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[I_011:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC1:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[A_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_1]], [[FOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP42:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B_ADDR_012]], i64 [[IDXPROM]]
-; CHECK-VF4UF2-NEXT:    [[TMP42]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-VF4UF2-NEXT:    [[MUL:%.*]] = fmul double [[SCALAR_RECUR]], [[TMP42]]
-; CHECK-VF4UF2-NEXT:    [[TOBOOL:%.*]] = fcmp une double [[MUL]], 0.000000e+00
-; CHECK-VF4UF2-NEXT:    [[INC:%.*]] = zext i1 [[TOBOOL]] to i32
-; CHECK-VF4UF2-NEXT:    [[A_1]] = add nsw i32 [[A_010]], [[INC]]
-; CHECK-VF4UF2-NEXT:    [[INC1]] = add nuw nsw i32 [[I_011]], 1
-; CHECK-VF4UF2-NEXT:    [[ADD_PTR]] = getelementptr inbounds double, double* [[B_ADDR_012]], i64 25
-; CHECK-VF4UF2-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC1]], 10240
-; CHECK-VF4UF2-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
+; CHECK-VF4UF2-LABEL: @PR33613
+; CHECK-VF4UF2: vector.body
+; CHECK-VF4UF2: %[[VEC_RECUR:.*]] = phi <vscale x 4 x double> [ {{.*}}, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %[[VEC_RECUR]], <vscale x 4 x double> {{.*}}, i32 -1)
+; CHECK-VF4UF2-NEXT: %[[SPLICE2:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %{{.*}}, <vscale x 4 x double> %{{.*}}, i32 -1)
+; CHECK-VF4UF2-NOT: insertelement <vscale x 4 x double>
+; CHECK-VF4UF2: middle.block
 entry:
   %idxprom = sext i32 %d to i64
   br label %for.body
@@ -249,248 +67,14 @@ for.body:
 ;
 ; Check that the sext sank after the load in the vector loop.
 define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) #0 {
-; CHECK-VF4UF1-LABEL: @PR34711(
-; CHECK-VF4UF1-NEXT:  entry:
-; CHECK-VF4UF1-NEXT:    [[C1:%.*]] = bitcast i32* [[C:%.*]] to i8*
-; CHECK-VF4UF1-NEXT:    [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-VF4UF1-NEXT:    [[PRE_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A:%.*]], i64 0, i64 0
-; CHECK-VF4UF1-NEXT:    [[DOTPRE:%.*]] = load i16, i16* [[PRE_INDEX]], align 2
-; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK-VF4UF1:       vector.memcheck:
-; CHECK-VF4UF1-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C]], i64 [[N]]
-; CHECK-VF4UF1-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-VF4UF1-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]]
-; CHECK-VF4UF1-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-VF4UF1-NEXT:    [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1
-; CHECK-VF4UF1-NEXT:    [[SCEVGEP67:%.*]] = bitcast i16* [[SCEVGEP6]] to i8*
-; CHECK-VF4UF1-NEXT:    [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0
-; CHECK-VF4UF1-NEXT:    [[SCEVGEP89:%.*]] = bitcast i16* [[SCEVGEP8]] to i8*
-; CHECK-VF4UF1-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP45]]
-; CHECK-VF4UF1-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-VF4UF1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-VF4UF1-NEXT:    [[BOUND010:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP89]]
-; CHECK-VF4UF1-NEXT:    [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]]
-; CHECK-VF4UF1-NEXT:    [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]]
-; CHECK-VF4UF1-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
-; CHECK-VF4UF1-NEXT:    [[BOUND013:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP89]]
-; CHECK-VF4UF1-NEXT:    [[BOUND114:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP45]]
-; CHECK-VF4UF1-NEXT:    [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]]
-; CHECK-VF4UF1-NEXT:    [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]]
-; CHECK-VF4UF1-NEXT:    br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4UF1:       vector.ph:
-; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
-; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[TMP5]]
-; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP8]]
-; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i64> [[TMP9]], zeroinitializer
-; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP10]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4UF1-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
-; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = mul i64 1, [[TMP13]]
-; CHECK-VF4UF1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP14]], i32 0
-; CHECK-VF4UF1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF1-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4UF1:       vector.body:
-; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_MASKED_GATHER:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP15]]
-; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], <vscale x 4 x i64> [[VEC_IND]], i64 1
-; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <vscale x 4 x i32>*
-; CHECK-VF4UF1-NEXT:    store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32>* [[TMP19]], align 4, !alias.scope !4, !noalias !7
-; CHECK-VF4UF1-NEXT:    [[WIDE_MASKED_GATHER]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> [[TMP17]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> undef), !alias.scope !10
-; CHECK-VF4UF1-NEXT:    [[TMP20:%.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER]], i32 -1)
-; CHECK-VF4UF1-NEXT:    [[TMP21:%.*]] = sext <vscale x 4 x i16> [[TMP20]] to <vscale x 4 x i32>
-; CHECK-VF4UF1-NEXT:    [[TMP22:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
-; CHECK-VF4UF1-NEXT:    [[TMP23:%.*]] = mul nsw <vscale x 4 x i32> [[TMP22]], [[TMP21]]
-; CHECK-VF4UF1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP15]]
-; CHECK-VF4UF1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i32 0
-; CHECK-VF4UF1-NEXT:    [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <vscale x 4 x i32>*
-; CHECK-VF4UF1-NEXT:    store <vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32>* [[TMP26]], align 4, !alias.scope !11, !noalias !10
-; CHECK-VF4UF1-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 4
-; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP28]]
-; CHECK-VF4UF1-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-VF4UF1-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4UF1-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-VF4UF1:       middle.block:
-; CHECK-VF4UF1-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP31]], 1
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_MASKED_GATHER]], i32 [[TMP32]]
-; CHECK-VF4UF1-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP34:%.*]] = mul i32 [[TMP33]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], 2
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_MASKED_GATHER]], i32 [[TMP35]]
-; CHECK-VF4UF1-NEXT:    br label [[SCALAR_PH]]
-; CHECK-VF4UF1:       scalar.ph:
-; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-VF4UF1-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4UF1:       for.body:
-; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP36:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[ARRAYCIDX:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-VF4UF1-NEXT:    [[CUR_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[INDVARS_IV]], i64 1
-; CHECK-VF4UF1-NEXT:    store i32 7, i32* [[ARRAYCIDX]], align 4
-; CHECK-VF4UF1-NEXT:    [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32
-; CHECK-VF4UF1-NEXT:    [[TMP36]] = load i16, i16* [[CUR_INDEX]], align 2
-; CHECK-VF4UF1-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP36]] to i32
-; CHECK-VF4UF1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV3]], [[CONV]]
-; CHECK-VF4UF1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-VF4UF1-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX5]], align 4
-; CHECK-VF4UF1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-VF4UF1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-VF4UF1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-VF4UF1:       for.end:
-; CHECK-VF4UF1-NEXT:    ret void
-;
-; CHECK-VF4UF2-LABEL: @PR34711(
-; CHECK-VF4UF2-NEXT:  entry:
-; CHECK-VF4UF2-NEXT:    [[C1:%.*]] = bitcast i32* [[C:%.*]] to i8*
-; CHECK-VF4UF2-NEXT:    [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-VF4UF2-NEXT:    [[PRE_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A:%.*]], i64 0, i64 0
-; CHECK-VF4UF2-NEXT:    [[DOTPRE:%.*]] = load i16, i16* [[PRE_INDEX]], align 2
-; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK-VF4UF2:       vector.memcheck:
-; CHECK-VF4UF2-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C]], i64 [[N]]
-; CHECK-VF4UF2-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-VF4UF2-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]]
-; CHECK-VF4UF2-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-VF4UF2-NEXT:    [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1
-; CHECK-VF4UF2-NEXT:    [[SCEVGEP67:%.*]] = bitcast i16* [[SCEVGEP6]] to i8*
-; CHECK-VF4UF2-NEXT:    [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0
-; CHECK-VF4UF2-NEXT:    [[SCEVGEP89:%.*]] = bitcast i16* [[SCEVGEP8]] to i8*
-; CHECK-VF4UF2-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP45]]
-; CHECK-VF4UF2-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-VF4UF2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-VF4UF2-NEXT:    [[BOUND010:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP89]]
-; CHECK-VF4UF2-NEXT:    [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]]
-; CHECK-VF4UF2-NEXT:    [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]]
-; CHECK-VF4UF2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
-; CHECK-VF4UF2-NEXT:    [[BOUND013:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP89]]
-; CHECK-VF4UF2-NEXT:    [[BOUND114:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP45]]
-; CHECK-VF4UF2-NEXT:    [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]]
-; CHECK-VF4UF2-NEXT:    [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]]
-; CHECK-VF4UF2-NEXT:    br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4UF2:       vector.ph:
-; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
-; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[TMP5]]
-; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP8]]
-; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i64> [[TMP9]], zeroinitializer
-; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP10]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4UF2-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
-; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = mul i64 1, [[TMP13]]
-; CHECK-VF4UF2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP14]], i32 0
-; CHECK-VF4UF2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF2-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4UF2:       vector.body:
-; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_MASKED_GATHER18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], 0
-; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 1
-; CHECK-VF4UF2-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], [[TMP19]]
-; CHECK-VF4UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP15]]
-; CHECK-VF4UF2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP20]]
-; CHECK-VF4UF2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], <vscale x 4 x i64> [[VEC_IND]], i64 1
-; CHECK-VF4UF2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], <vscale x 4 x i64> [[STEP_ADD]], i64 1
-; CHECK-VF4UF2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0
-; CHECK-VF4UF2-NEXT:    [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <vscale x 4 x i32>*
-; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32>* [[TMP26]], align 4, !alias.scope !4, !noalias !7
-; CHECK-VF4UF2-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 [[TMP28]]
-; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = bitcast i32* [[TMP29]] to <vscale x 4 x i32>*
-; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32>* [[TMP30]], align 4, !alias.scope !4, !noalias !7
-; CHECK-VF4UF2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> [[TMP23]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> undef), !alias.scope !10
-; CHECK-VF4UF2-NEXT:    [[WIDE_MASKED_GATHER18]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> [[TMP24]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> undef), !alias.scope !10
-; CHECK-VF4UF2-NEXT:    [[TMP31:%.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER]], i32 -1)
-; CHECK-VF4UF2-NEXT:    [[TMP32:%.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> [[WIDE_MASKED_GATHER]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER18]], i32 -1)
-; CHECK-VF4UF2-NEXT:    [[TMP33:%.*]] = sext <vscale x 4 x i16> [[TMP31]] to <vscale x 4 x i32>
-; CHECK-VF4UF2-NEXT:    [[TMP34:%.*]] = sext <vscale x 4 x i16> [[TMP32]] to <vscale x 4 x i32>
-; CHECK-VF4UF2-NEXT:    [[TMP35:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
-; CHECK-VF4UF2-NEXT:    [[TMP36:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER18]] to <vscale x 4 x i32>
-; CHECK-VF4UF2-NEXT:    [[TMP37:%.*]] = mul nsw <vscale x 4 x i32> [[TMP35]], [[TMP33]]
-; CHECK-VF4UF2-NEXT:    [[TMP38:%.*]] = mul nsw <vscale x 4 x i32> [[TMP36]], [[TMP34]]
-; CHECK-VF4UF2-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP15]]
-; CHECK-VF4UF2-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP20]]
-; CHECK-VF4UF2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[TMP39]], i32 0
-; CHECK-VF4UF2-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <vscale x 4 x i32>*
-; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP37]], <vscale x 4 x i32>* [[TMP42]], align 4, !alias.scope !11, !noalias !10
-; CHECK-VF4UF2-NEXT:    [[TMP43:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP44:%.*]] = mul i32 [[TMP43]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TMP39]], i32 [[TMP44]]
-; CHECK-VF4UF2-NEXT:    [[TMP46:%.*]] = bitcast i32* [[TMP45]] to <vscale x 4 x i32>*
-; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP38]], <vscale x 4 x i32>* [[TMP46]], align 4, !alias.scope !11, !noalias !10
-; CHECK-VF4UF2-NEXT:    [[TMP47:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP47]], 8
-; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP48]]
-; CHECK-VF4UF2-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
-; CHECK-VF4UF2-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4UF2-NEXT:    br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-VF4UF2:       middle.block:
-; CHECK-VF4UF2-NEXT:    [[TMP50:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP51:%.*]] = mul i32 [[TMP50]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP52:%.*]] = sub i32 [[TMP51]], 1
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_MASKED_GATHER18]], i32 [[TMP52]]
-; CHECK-VF4UF2-NEXT:    [[TMP53:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP54]], 2
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_MASKED_GATHER18]], i32 [[TMP55]]
-; CHECK-VF4UF2-NEXT:    br label [[SCALAR_PH]]
-; CHECK-VF4UF2:       scalar.ph:
-; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-VF4UF2-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4UF2:       for.body:
-; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP56:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[ARRAYCIDX:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-VF4UF2-NEXT:    [[CUR_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[INDVARS_IV]], i64 1
-; CHECK-VF4UF2-NEXT:    store i32 7, i32* [[ARRAYCIDX]], align 4
-; CHECK-VF4UF2-NEXT:    [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32
-; CHECK-VF4UF2-NEXT:    [[TMP56]] = load i16, i16* [[CUR_INDEX]], align 2
-; CHECK-VF4UF2-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP56]] to i32
-; CHECK-VF4UF2-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV3]], [[CONV]]
-; CHECK-VF4UF2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-VF4UF2-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX5]], align 4
-; CHECK-VF4UF2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-VF4UF2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-VF4UF2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-VF4UF2:       for.end:
-; CHECK-VF4UF2-NEXT:    ret void
-;
+; CHECK-VF4UF1-LABEL: @PR34711
+; CHECK-VF4UF1: vector.body
+; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[MGATHER:.*]], %vector.body ]
+; CHECK-VF4UF1: %[[MGATHER]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> {{.*}}, i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> undef)
+; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[MGATHER]], i32 -1)
+; CHECK-VF4UF1-NEXT: %[[SXT1:.*]] = sext <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x i32>
+; CHECK-VF4UF1-NEXT: %[[SXT2:.*]] = sext <vscale x 4 x i16> %[[MGATHER]] to <vscale x 4 x i32>
+; CHECK-VF4UF1-NEXT: mul nsw <vscale x 4 x i32> %[[SXT2]], %[[SXT1]]
 entry:
   %pre.index = getelementptr inbounds [2 x i16], [2 x i16]* %a, i64 0, i64 0
   %.pre = load i16, i16* %pre.index

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll
index 2d1697b8e94f0..a40dafe6ec21e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios -S -mcpu=cyclone -enable-interleaved-mem-accesses=false < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
 
@@ -17,70 +16,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK-NOT: x float>
 
 define void @_Z4testmm(i64 %size, i64 %offset) {
-; CHECK-LABEL: @_Z4testmm(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP53:%.*]] = icmp eq i64 [[SIZE:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP53]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]]
-; CHECK:       for.body.lr.ph:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[R_057:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[G_056:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD20:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[V_055:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[B_054:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD30:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[V_055]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[ADD]], 3
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[MUL]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 [[V_055]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast float [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 [[V_055]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP2]]
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 [[V_055]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP3]]
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 [[V_055]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX8]], align 4
-; CHECK-NEXT:    [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP4]]
-; CHECK-NEXT:    [[ADD10]] = fadd fast float [[R_057]], [[MUL9]]
-; CHECK-NEXT:    [[ARRAYIDX_SUM:%.*]] = add i64 [[MUL]], 1
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[ARRAYIDX_SUM]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX11]], align 4
-; CHECK-NEXT:    [[MUL13:%.*]] = fmul fast float [[TMP1]], [[TMP5]]
-; CHECK-NEXT:    [[MUL15:%.*]] = fmul fast float [[TMP2]], [[MUL13]]
-; CHECK-NEXT:    [[MUL17:%.*]] = fmul fast float [[TMP3]], [[MUL15]]
-; CHECK-NEXT:    [[MUL19:%.*]] = fmul fast float [[TMP4]], [[MUL17]]
-; CHECK-NEXT:    [[ADD20]] = fadd fast float [[G_056]], [[MUL19]]
-; CHECK-NEXT:    [[ARRAYIDX_SUM52:%.*]] = add i64 [[MUL]], 2
-; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[ARRAYIDX_SUM52]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX21]], align 4
-; CHECK-NEXT:    [[MUL23:%.*]] = fmul fast float [[TMP1]], [[TMP6]]
-; CHECK-NEXT:    [[MUL25:%.*]] = fmul fast float [[TMP2]], [[MUL23]]
-; CHECK-NEXT:    [[MUL27:%.*]] = fmul fast float [[TMP3]], [[MUL25]]
-; CHECK-NEXT:    [[MUL29:%.*]] = fmul fast float [[TMP4]], [[MUL27]]
-; CHECK-NEXT:    [[ADD30]] = fadd fast float [[B_054]], [[MUL29]]
-; CHECK-NEXT:    [[INC]] = add i64 [[V_055]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INC]], [[SIZE]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]]
-; CHECK:       for.cond.for.end_crit_edge:
-; CHECK-NEXT:    [[ADD30_LCSSA:%.*]] = phi float [ [[ADD30]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ADD20_LCSSA:%.*]] = phi float [ [[ADD20]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ADD10_LCSSA:%.*]] = phi float [ [[ADD10]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[PHITMP:%.*]] = fptoui float [[ADD10_LCSSA]] to i8
-; CHECK-NEXT:    [[PHITMP60:%.*]] = fptoui float [[ADD20_LCSSA]] to i8
-; CHECK-NEXT:    [[PHITMP61:%.*]] = fptoui float [[ADD30_LCSSA]] to i8
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i8 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[G_0_LCSSA:%.*]] = phi i8 [ [[PHITMP60]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[B_0_LCSSA:%.*]] = phi i8 [ [[PHITMP61]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    store i8 [[R_0_LCSSA]], i8* @r_, align 1
-; CHECK-NEXT:    store i8 [[G_0_LCSSA]], i8* @g_, align 1
-; CHECK-NEXT:    store i8 [[B_0_LCSSA]], i8* @b_, align 1
-; CHECK-NEXT:    ret void
-;
 entry:
   %cmp53 = icmp eq i64 %size, 0
   br i1 %cmp53, label %for.end, label %for.body.lr.ph

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll
index a50ffe531321c..d0e4796baf446 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll
@@ -1,17 +1,12 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -mtriple=arm64-apple-darinw -S %s | FileCheck %s
 
 ; In the loop below, both the current and previous values of a first-order
 ; recurrence are stored in an interleave group.
 define void @interleaved_store_first_order_recurrence(i32* noalias %src, i32* %dst) {
 ; CHECK-LABEL: @interleaved_store_first_order_recurrence(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 99>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 99>, %vector.ph ], [ [[BROADCAST_SPLAT:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[SRC:%.*]], align 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
@@ -19,42 +14,17 @@ define void @interleaved_store_first_order_recurrence(i32* noalias %src, i32* %d
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw nsw i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i64 2
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 -2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i32> [[TMP10]], <12 x i32> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 -2
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <12 x i32>*
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP10]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i32> [[TMP11]], <12 x i32> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 2
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 99, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[FOR_NEXT]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    [[OFF:%.*]] = mul nuw nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[OFF]]
-; CHECK-NEXT:    store i32 0, i32* [[GEP_1]], align 4
-; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[GEP_1]], i64 1
-; CHECK-NEXT:    store i32 [[SCALAR_RECUR]], i32* [[GEP_2]], align 4
-; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[GEP_1]], i64 2
-; CHECK-NEXT:    store i32 [[FOR_NEXT]], i32* [[GEP_3]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP12]], label %middle.block, label %vector.body
 ;
 entry:
   br label %loop

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index 7e62812d67563..916a0ba96ebfa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -148,10 +148,10 @@ define void @umin(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocaptur
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END17:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END14:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[CAST_CRD10:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END11:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD10]]
+; CHECK-NEXT:    [[IND_END19:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END16:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[CAST_CRD12:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END13:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD12]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 24
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -160,40 +160,40 @@ define void @umin(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocaptur
 ; CHECK-NEXT:    [[N_VEC9:%.*]] = and i64 [[TMP2]], 8589934584
 ; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i64 [[N_VEC9]] to i32
 ; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
-; CHECK-NEXT:    [[IND_END13:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC9]]
-; CHECK-NEXT:    [[IND_END16:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC9]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT23:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT24:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT23]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[IND_END15:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC9]]
+; CHECK-NEXT:    [[IND_END18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC9]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT25]], <8 x i8> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[NEXT_GEP20:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX19]]
-; CHECK-NEXT:    [[NEXT_GEP21:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX19]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8* [[NEXT_GEP20]] to <8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i8>, <8 x i8>* [[TMP12]], align 2
-; CHECK-NEXT:    [[TMP13:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD22]], <8 x i8> [[BROADCAST_SPLAT24]])
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8* [[NEXT_GEP21]] to <8 x i8>*
-; CHECK-NEXT:    store <8 x i8> [[TMP13]], <8 x i8>* [[TMP14]], align 2
-; CHECK-NEXT:    [[INDEX_NEXT25]] = add nuw i64 [[INDEX19]], 8
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC9]]
-; CHECK-NEXT:    br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NEXT_GEP22:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX10]]
+; CHECK-NEXT:    [[NEXT_GEP23:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8* [[NEXT_GEP22]] to <8 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD24:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 2
+; CHECK-NEXT:    [[TMP16:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD24]], <8 x i8> [[BROADCAST_SPLAT26]])
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8* [[NEXT_GEP23]] to <8 x i8>*
+; CHECK-NEXT:    store <8 x i8> [[TMP16]], <8 x i8>* [[TMP17]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[INDEX10]], 8
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC9]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N18:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC9]]
-; CHECK-NEXT:    br i1 [[CMP_N18]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N20:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC9]]
+; CHECK-NEXT:    br i1 [[CMP_N20]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL12:%.*]] = phi i8* [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL15:%.*]] = phi i8* [ [[IND_END16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END17]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL14:%.*]] = phi i8* [ [[IND_END15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END16]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL17:%.*]] = phi i8* [ [[IND_END18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL15]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL14]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL17]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PSRC_ADDR_08]], i64 1
-; CHECK-NEXT:    [[TMP16:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
-; CHECK-NEXT:    [[TMP17:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP16]], i8 [[OFFSET]])
+; CHECK-NEXT:    [[TMP19:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP19]], i8 [[OFFSET]])
 ; CHECK-NEXT:    [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PDST_ADDR_07]], i64 1
-; CHECK-NEXT:    store i8 [[TMP17]], i8* [[PDST_ADDR_07]], align 2
+; CHECK-NEXT:    store i8 [[TMP20]], i8* [[PDST_ADDR_07]], align 2
 ; CHECK-NEXT:    [[DEC]] = add i32 [[BLKCNT_09]], -1
 ; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
 ; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]]

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
index dff06cd3ba754..de82ae1b62f44 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt -loop-vectorize -force-vector-interleave=1 -S -debug-only=loop-vectorize < %s 2>%t | FileCheck %s
 ; RUN: cat %t | FileCheck %s --check-prefix=CHECK-COST
@@ -13,30 +12,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; We should decide this loop is not worth vectorising using fixed width vectors
 define void @fixed_width(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) #0 {
 ; CHECK-LABEL: @fixed_width(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I_07]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I_07]]
-; CHECK-NEXT:    store i32 2, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-;
+; CHECK-NOT: vector.body
 entry:
   %cmp6 = icmp sgt i64 %n, 0
   br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
@@ -74,63 +50,8 @@ for.inc:                                          ; preds = %for.body, %if.then
 
 define void @scalable(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) #0 {
 ; CHECK-LABEL: @scalable(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP10]])
-; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_07]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP16]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_07]]
-; CHECK-NEXT:    store i32 2, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
+; CHECK: vector.body
+; CHECK: call void @llvm.masked.store.nxv4i32.p0nxv4i32
 entry:
   %cmp6 = icmp sgt i64 %n, 0
   br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll
index 90fa7dd4917bf..a9c1f6c5cd1da 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll
@@ -1,21 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphones -force-vector-width=4 -force-vector-interleave=1 %s -S | FileCheck %s
 
 ; Vectors with i4 elements may not be legal with nontemporal stores.
 define void @test_i4_store(i4* %ddst) {
-; CHECK-LABEL: @test_i4_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[DDST_ADDR:%.*]] = phi i4* [ [[DDST:%.*]], [[ENTRY]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i4, i4* [[DDST_ADDR]], i64 1
-; CHECK-NEXT:    store i4 -6, i4* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @test_i4_store(
+; CHECK-NOT:   vector.body:
+; CHECK:        ret void
 ;
 entry:
   br label %for.body
@@ -34,39 +23,10 @@ for.cond.cleanup:                                 ; preds = %for.body
 }
 
 define void @test_i8_store(i8* %ddst) {
-; CHECK-LABEL: @test_i8_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, i8* [[DDST:%.*]], i64 1024
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[DDST]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> <i8 10, i8 10, i8 10, i8 10>, <4 x i8>* [[TMP2]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[DDST_ADDR:%.*]] = phi i8* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[DDST_ADDR]], i64 1
-; CHECK-NEXT:    store i8 10, i8* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @test_i8_store(
+; CHECK-LABEL: vector.body:
+; CHECK:         store <4 x i8> {{.*}} !nontemporal !0
+; CHECK:         br
 ;
 entry:
   br label %for.body
@@ -85,39 +45,10 @@ for.cond.cleanup:                                 ; preds = %for.body
 }
 
 define void @test_half_store(half* %ddst) {
-; CHECK-LABEL: @test_half_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr half, half* [[DDST:%.*]], i64 1024
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr half, half* [[DDST]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr half, half* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast half* [[TMP1]] to <4 x half>*
-; CHECK-NEXT:    store <4 x half> <half 0xH4900, half 0xH4900, half 0xH4900, half 0xH4900>, <4 x half>* [[TMP2]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi half* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[DDST_ADDR:%.*]] = phi half* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds half, half* [[DDST_ADDR]], i64 1
-; CHECK-NEXT:    store half 0xH4900, half* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @test_half_store(
+; CHECK-LABEL: vector.body:
+; CHECK:         store <4 x half> {{.*}} !nontemporal !0
+; CHECK:         br
 ;
 entry:
   br label %for.body
@@ -136,39 +67,10 @@ for.cond.cleanup:                                 ; preds = %for.body
 }
 
 define void @test_i16_store(i16* %ddst) {
-; CHECK-LABEL: @test_i16_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i16, i16* [[DDST:%.*]], i64 1024
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[DDST]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[TMP1]] to <4 x i16>*
-; CHECK-NEXT:    store <4 x i16> <i16 10, i16 10, i16 10, i16 10>, <4 x i16>* [[TMP2]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[DDST_ADDR:%.*]] = phi i16* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[DDST_ADDR]], i64 1
-; CHECK-NEXT:    store i16 10, i16* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @test_i16_store(
+; CHECK-LABEL: vector.body:
+; CHECK:         store <4 x i16> {{.*}} !nontemporal !0
+; CHECK:         br
 ;
 entry:
   br label %for.body
@@ -187,47 +89,10 @@ for.cond.cleanup:                                 ; preds = %for.body
 }
 
 define void @test_i32_store(i32* nocapture %ddst) {
-; CHECK-LABEL: @test_i32_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i32, i32* [[DDST:%.*]], i64 4096
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[DDST]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[NEXT_GEP]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 -3
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>*
-; CHECK-NEXT:    store <16 x i32> <i32 10, i32 20, i32 30, i32 40, i32 10, i32 20, i32 30, i32 40, i32 10, i32 20, i32 30, i32 40, i32 10, i32 20, i32 30, i32 40>, <16 x i32>* [[TMP4]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[DDST_ADDR:%.*]] = phi i32* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR3:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[DDST_ADDR]], i64 1
-; CHECK-NEXT:    store i32 10, i32* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DDST_ADDR]], i64 2
-; CHECK-NEXT:    store i32 20, i32* [[INCDEC_PTR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[DDST_ADDR]], i64 3
-; CHECK-NEXT:    store i32 30, i32* [[INCDEC_PTR1]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INCDEC_PTR3]] = getelementptr inbounds i32, i32* [[DDST_ADDR]], i64 4
-; CHECK-NEXT:    store i32 40, i32* [[INCDEC_PTR2]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @test_i32_store(
+; CHECK-LABEL: vector.body:
+; CHECK:         store <16 x i32> {{.*}} !nontemporal !0
+; CHECK:         br
 ;
 entry:
   br label %for.body
@@ -252,25 +117,9 @@ for.cond.cleanup:                                 ; preds = %for.body
 }
 
 define void @test_i33_store(i33* nocapture %ddst) {
-; CHECK-LABEL: @test_i33_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[DDST_ADDR:%.*]] = phi i33* [ [[DDST:%.*]], [[ENTRY]] ], [ [[INCDEC_PTR3:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i33, i33* [[DDST_ADDR]], i64 1
-; CHECK-NEXT:    store i33 10, i33* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i33, i33* [[DDST_ADDR]], i64 2
-; CHECK-NEXT:    store i33 20, i33* [[INCDEC_PTR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i33, i33* [[DDST_ADDR]], i64 3
-; CHECK-NEXT:    store i33 30, i33* [[INCDEC_PTR1]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INCDEC_PTR3]] = getelementptr inbounds i33, i33* [[DDST_ADDR]], i64 4
-; CHECK-NEXT:    store i33 40, i33* [[INCDEC_PTR2]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I]], 3
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @test_i33_store(
+; CHECK-NOT:   vector.body:
+; CHECK:         ret
 ;
 entry:
   br label %for.body
@@ -295,25 +144,9 @@ for.cond.cleanup:                                 ; preds = %for.body
 }
 
 define void @test_i40_store(i40* nocapture %ddst) {
-; CHECK-LABEL: @test_i40_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[DDST_ADDR:%.*]] = phi i40* [ [[DDST:%.*]], [[ENTRY]] ], [ [[INCDEC_PTR3:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i40, i40* [[DDST_ADDR]], i64 1
-; CHECK-NEXT:    store i40 10, i40* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i40, i40* [[DDST_ADDR]], i64 2
-; CHECK-NEXT:    store i40 20, i40* [[INCDEC_PTR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i40, i40* [[DDST_ADDR]], i64 3
-; CHECK-NEXT:    store i40 30, i40* [[INCDEC_PTR1]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INCDEC_PTR3]] = getelementptr inbounds i40, i40* [[DDST_ADDR]], i64 4
-; CHECK-NEXT:    store i40 40, i40* [[INCDEC_PTR2]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I]], 3
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @test_i40_store(
+; CHECK-NOT:   vector.body:
+; CHECK:         ret
 ;
 entry:
   br label %for.body
@@ -337,39 +170,10 @@ for.cond.cleanup:                                 ; preds = %for.body
   ret void
 }
 define void @test_i64_store(i64* nocapture %ddst) local_unnamed_addr #0 {
-; CHECK-LABEL: @test_i64_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i64, i64* [[DDST:%.*]], i64 1024
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i64, i64* [[DDST]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, i64* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[TMP1]] to <4 x i64>*
-; CHECK-NEXT:    store <4 x i64> <i64 10, i64 10, i64 10, i64 10>, <4 x i64>* [[TMP2]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[DDST_ADDR:%.*]] = phi i64* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i64, i64* [[DDST_ADDR]], i64 1
-; CHECK-NEXT:    store i64 10, i64* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @test_i64_store(
+; CHECK-LABEL: vector.body:
+; CHECK:         store <4 x i64> {{.*}} !nontemporal !0
+; CHECK:         br
 ;
 entry:
   br label %for.body
@@ -388,39 +192,10 @@ for.cond.cleanup:                                 ; preds = %for.body
 }
 
 define void @test_double_store(double* %ddst) {
-; CHECK-LABEL: @test_double_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr double, double* [[DDST:%.*]], i64 1024
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr double, double* [[DDST]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[TMP1]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> <double 1.000000e+01, double 1.000000e+01, double 1.000000e+01, double 1.000000e+01>, <4 x double>* [[TMP2]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi double* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[DDST_ADDR:%.*]] = phi double* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds double, double* [[DDST_ADDR]], i64 1
-; CHECK-NEXT:    store double 1.000000e+01, double* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @test_double_store(
+; CHECK-LABEL: vector.body:
+; CHECK:         store <4 x double> {{.*}} !nontemporal !0
+; CHECK:         br
 ;
 entry:
   br label %for.body
@@ -439,39 +214,10 @@ for.cond.cleanup:                                 ; preds = %for.body
 }
 
 define void @test_i128_store(i128* %ddst) {
-; CHECK-LABEL: @test_i128_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i128, i128* [[DDST:%.*]], i64 1024
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i128, i128* [[DDST]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i128, i128* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i128* [[TMP1]] to <4 x i128>*
-; CHECK-NEXT:    store <4 x i128> <i128 10, i128 10, i128 10, i128 10>, <4 x i128>* [[TMP2]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i128* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[DDST_ADDR:%.*]] = phi i128* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i128, i128* [[DDST_ADDR]], i64 1
-; CHECK-NEXT:    store i128 10, i128* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @test_i128_store(
+; CHECK-LABEL: vector.body:
+; CHECK:         store <4 x i128> {{.*}} !nontemporal !0
+; CHECK:         br
 ;
 entry:
   br label %for.body
@@ -490,19 +236,9 @@ for.cond.cleanup:                                 ; preds = %for.body
 }
 
 define void @test_i256_store(i256* %ddst) {
-; CHECK-LABEL: @test_i256_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[DDST_ADDR:%.*]] = phi i256* [ [[DDST:%.*]], [[ENTRY]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i256, i256* [[DDST_ADDR]], i64 1
-; CHECK-NEXT:    store i256 10, i256* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @test_i256_store(
+; CHECK-NOT:   vector.body:
+; CHECK:        ret void
 ;
 entry:
   br label %for.body

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
index 8d45f2010fbad..3e5c59681a004 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=aarch64 -loop-vectorize -force-vector-width=2 < %s | FileCheck %s
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -8,87 +7,7 @@ target triple = "aarch64--linux-gnu"
 
 define i32 @fn1() local_unnamed_addr #0 {
 ; We expect the backend to expand all reductions.
-; CHECK-LABEL: @fn1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @b, align 4, !tbaa [[TBAA1:![0-9]+]]
-; CHECK-NEXT:    [[CMP40:%.*]] = icmp sgt i32 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[CMP40]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.lr.ph:
-; CHECK-NEXT:    [[TMP1:%.*]] = load i16*, i16** @a, align 8, !tbaa [[TBAA5:![0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* @b, align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
-; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP3]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i16> [ undef, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i16> [ undef, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <2 x i16> [ undef, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <2 x i16> [ undef, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i16* [[TMP8]] to <2 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP9]], align 2, !tbaa [[TBAA7:![0-9]+]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to <2 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <2 x i16>, <2 x i16>* [[TMP11]], align 2, !tbaa [[TBAA7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp sgt <2 x i16> [[VEC_PHI2]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp sgt <2 x i16> [[VEC_PHI3]], [[WIDE_LOAD4]]
-; CHECK-NEXT:    [[TMP14]] = select <2 x i1> [[TMP12]], <2 x i16> [[VEC_PHI2]], <2 x i16> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP15]] = select <2 x i1> [[TMP13]], <2 x i16> [[VEC_PHI3]], <2 x i16> [[WIDE_LOAD4]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp slt <2 x i16> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp slt <2 x i16> [[VEC_PHI1]], [[WIDE_LOAD4]]
-; CHECK-NEXT:    [[TMP18]] = select <2 x i1> [[TMP16]], <2 x i16> [[VEC_PHI]], <2 x i16> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP19]] = select <2 x i1> [[TMP17]], <2 x i16> [[VEC_PHI1]], <2 x i16> [[WIDE_LOAD4]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <2 x i16> [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP5]], <2 x i16> [[TMP14]], <2 x i16> [[TMP15]]
-; CHECK-NEXT:    [[TMP21:%.*]] = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> [[RDX_MINMAX_SELECT6]])
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp slt <2 x i16> [[TMP18]], [[TMP19]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i16> [[TMP18]], <2 x i16> [[TMP19]]
-; CHECK-NEXT:    [[TMP22:%.*]] = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ undef, [[FOR_BODY_LR_PH]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX7:%.*]] = phi i16 [ undef, [[FOR_BODY_LR_PH]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[D_043:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSINK28:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[C_042:%.*]] = phi i16 [ [[BC_MERGE_RDX7]], [[SCALAR_PH]] ], [ [[C_0_:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX]], align 2, !tbaa [[TBAA7]]
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i16 [[C_042]], [[TMP23]]
-; CHECK-NEXT:    [[C_0_]] = select i1 [[CMP2]], i16 [[C_042]], i16 [[TMP23]]
-; CHECK-NEXT:    [[CMP13:%.*]] = icmp slt i16 [[D_043]], [[TMP23]]
-; CHECK-NEXT:    [[DOTSINK28]] = select i1 [[CMP13]], i16 [[D_043]], i16 [[TMP23]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    [[C_0__LCSSA:%.*]] = phi i16 [ [[C_0_]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[DOTSINK28_LCSSA:%.*]] = phi i16 [ [[DOTSINK28]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[C_0_LCSSA:%.*]] = phi i16 [ undef, [[ENTRY:%.*]] ], [ [[C_0__LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-NEXT:    [[D_0_LCSSA:%.*]] = phi i16 [ undef, [[ENTRY]] ], [ [[DOTSINK28_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-NEXT:    [[CMP26:%.*]] = icmp sgt i16 [[C_0_LCSSA]], [[D_0_LCSSA]]
-; CHECK-NEXT:    [[CONV27:%.*]] = zext i1 [[CMP26]] to i32
-; CHECK-NEXT:    ret i32 [[CONV27]]
-;
+; CHECK: @llvm.vector.reduce
 entry:
   %0 = load i32, i32* @b, align 4, !tbaa !1
   %cmp40 = icmp sgt i32 %0, 0

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll
index 88ffc65d99968..ad41eb554a9bf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -mattr=+sve -mtriple aarch64-unknown-linux-gnu -force-vector-width=2 -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
 ; RUN: FileCheck %s --check-prefix=CHECK-REMARKS < %t
 
@@ -7,44 +6,7 @@
 ; CHECK-REMARKS: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @alloca(i32** %vla, i64 %N) {
 ; CHECK-LABEL: @alloca(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = alloca i32, align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32*, i32** [[VLA:%.*]], i64 [[INDUCTION]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32*, i32** [[VLA]], i64 [[INDUCTION1]]
-; CHECK-NEXT:    store i32* [[TMP0]], i32** [[TMP2]], align 8
-; CHECK-NEXT:    store i32* [[TMP1]], i32** [[TMP3]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 16
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32*, i32** [[VLA]], i64 [[IV]]
-; CHECK-NEXT:    store i32* [[ALLOCA]], i32** [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    call void @foo(i32** nonnull [[VLA]])
-; CHECK-NEXT:    ret void
-;
+; CHECK-NOT: <vscale x
 
 entry:
   br label %for.body

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
index f3e4113293728..7675a568051e6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -instcombine -mattr=+sve -mtriple aarch64-unknown-linux-gnu \
 ; RUN:     -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
 ; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS
@@ -7,65 +6,10 @@
 ; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS
 
 define void @vec_load(i64 %N, double* nocapture %a, double* nocapture readonly %b) {
-; CHECK-LABEL: @vec_load(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[N]]
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr double, double* [[B:%.*]], i64 [[N]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 1
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <vscale x 2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP5]], align 8, !alias.scope !5
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <vscale x 2 x double> [[TMP6]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i32 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[TMP8]] to <vscale x 2 x double>*
-; CHECK-NEXT:    store <vscale x 2 x double> [[TMP7]], <vscale x 2 x double>* [[TMP9]], align 8, !alias.scope !8, !noalias !5
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[TMP14:%.*]] = call double @foo(double [[TMP13]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP14]], 1.000000e+00
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT:    store double [[ADD]], double* [[ARRAYIDX2]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @vec_load
+; CHECK: vector.body:
+; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
+; CHECK: call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> %[[LOAD]])
 entry:
   %cmp7 = icmp sgt i64 %N, 0
   br i1 %cmp7, label %for.body, label %for.end
@@ -87,53 +31,9 @@ for.end:                                 ; preds = %for.body, %entry
 }
 
 define void @vec_scalar(i64 %N, double* nocapture %a) {
-; CHECK-LABEL: @vec_scalar(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 1
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+01, i32 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub <vscale x 2 x double> [[TMP4]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i32 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <vscale x 2 x double>*
-; CHECK-NEXT:    store <vscale x 2 x double> [[TMP5]], <vscale x 2 x double>* [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = shl i64 [[TMP8]], 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = call double @foo(double 1.000000e+01) #[[ATTR5]]
-; CHECK-NEXT:    [[SUB:%.*]] = fadd double [[TMP11]], -1.000000e+00
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT:    store double [[SUB]], double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @vec_scalar
+; CHECK: vector.body:
+; CHECK: call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+01, i32 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
 entry:
   %cmp7 = icmp sgt i64 %N, 0
   br i1 %cmp7, label %for.body, label %for.end
@@ -153,56 +53,10 @@ for.end:                                 ; preds = %for.body, %entry
 }
 
 define void @vec_ptr(i64 %N, i64* noalias %a, i64** readnone %b) {
-; CHECK-LABEL: @vec_ptr(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 1
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64*, i64** [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64** [[TMP4]] to <vscale x 2 x i64*>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64*>, <vscale x 2 x i64*>* [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @bar_vec(<vscale x 2 x i64*> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64* [[TMP7]] to <vscale x 2 x i64>*
-; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i64*, i64** [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i64*, i64** [[GEP]], align 8
-; CHECK-NEXT:    [[CALL:%.*]] = call i64 @bar(i64* [[LOAD]]) #[[ATTR6:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALL]], i64* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @vec_ptr
+; CHECK: vector.body:
+; CHECK: %[[LOAD:.*]] = load <vscale x 2 x i64*>, <vscale x 2 x i64*>*
+; CHECK: call <vscale x 2 x i64> @bar_vec(<vscale x 2 x i64*> %[[LOAD]])
 entry:
   %cmp7 = icmp sgt i64 %N, 0
   br i1 %cmp7, label %for.body, label %for.end
@@ -223,56 +77,10 @@ for.end:
 }
 
 define void @vec_intrinsic(i64 %N, double* nocapture readonly %a) {
-; CHECK-LABEL: @vec_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 1
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <vscale x 2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = call fast <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <vscale x 2 x double> [[TMP6]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i32 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP4]] to <vscale x 2 x double>*
-; CHECK-NEXT:    store <vscale x 2 x double> [[TMP7]], <vscale x 2 x double>* [[TMP8]], align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = call fast double @llvm.sin.f64(double [[TMP12]]) #[[ATTR7:[0-9]+]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast double [[TMP13]], 1.000000e+00
-; CHECK-NEXT:    store double [[ADD]], double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @vec_intrinsic
+; CHECK: vector.body:
+; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
+; CHECK: call fast <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double> %[[LOAD]])
 entry:
   %cmp7 = icmp sgt i64 %N, 0
   br i1 %cmp7, label %for.body, label %for.end
@@ -297,44 +105,9 @@ for.end:
 ; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
 ; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @vec_sin_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
-; CHECK-LABEL: @vec_sin_no_mapping(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -2
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <2 x float>*, !dbg [[DBG20:![0-9]+]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4, !dbg [[DBG20]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[WIDE_LOAD]]), !dbg [[DBG23:![0-9]+]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>*, !dbg [[DBG24:![0-9]+]]
-; CHECK-NEXT:    store <2 x float> [[TMP2]], <2 x float>* [[TMP4]], align 4, !dbg [[DBG24]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX]], align 4, !dbg [[DBG20]]
-; CHECK-NEXT:    [[TMP7:%.*]] = tail call fast float @llvm.sin.f32(float [[TMP6]]), !dbg [[DBG23]]
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[DST]], i64 [[I_07]]
-; CHECK-NEXT:    store float [[TMP7]], float* [[ARRAYIDX1]], align 4, !dbg [[DBG24]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-;
+; CHECK: @vec_sin_no_mapping
+; CHECK: call fast <2 x float> @llvm.sin.v2f32
+; CHECK-NOT: <vscale x
 entry:
   br label %for.body
 
@@ -359,6 +132,9 @@ for.cond.cleanup:                                 ; preds = %for.body
 ; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
 ; CHECK-REMARKS-NEXT: t.c:3:40: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @vec_sin_no_mapping_ite(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
+; CHECK: @vec_sin_no_mapping_ite
+; CHECK-NOT: <vscale x
+; CHECK: ret
 entry:
   br label %for.body
 
@@ -391,6 +167,9 @@ for.cond.cleanup:                                 ; preds = %for.body
 ; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
 ; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @vec_sin_fixed_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
+; CHECK: @vec_sin_fixed_mapping
+; CHECK: call fast <2 x float> @llvm.sin.v2f32
+; CHECK-NOT: <vscale x
 entry:
   br label %for.body
 
@@ -413,6 +192,8 @@ for.cond.cleanup:                                 ; preds = %for.body
 ; in the loop below we can still vectorize the loop because SVE has
 ; hardware support in the form of the 'fsqrt' instruction.
 define void @vec_sqrt_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) #0 {
+; CHECK: @vec_sqrt_no_mapping
+; CHECK: call fast <vscale x 2 x float> @llvm.sqrt.nxv2f32
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll
index bb4b6dfce5788..e9b95a2051c54 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -S | FileCheck %s
 ; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
 
@@ -17,6 +16,9 @@ target triple = "aarch64-unknown-linux-gnu"
 ; Future implementation of llvm.vp could allow this to happen
 
 define void  @predication_in_loop(i32* %a, i32* %b, i32* %cond) #0 {
+; CHECK-LABEL: @predication_in_loop
+; CHECK-NOT:  sdiv <vscale x 4 x i32>
+;
 entry:
   br label %for.body
 
@@ -59,64 +61,8 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; "Max legal vector width too small, scalable vectorization unfeasible.."
 
 define void @unpredicated_loop_predication_through_tailfolding(i32* %a, i32* %b) #0 {
-; CHECK-LABEL: @unpredicated_loop_predication_through_tailfolding(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
-; CHECK-NEXT:    [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 1032
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B]], i64 1024
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !11, !noalias !14
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !14
-; CHECK-NEXT:    [[TMP7:%.*]] = sdiv <4 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP0]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP11]], align 4, !alias.scope !11, !noalias !14
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[SDIV:%.*]] = sdiv i32 [[TMP14]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i64 [[IV]], 8
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
-; CHECK-NEXT:    store i32 [[SDIV]], i32* [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @unpredicated_loop_predication_through_tailfolding
+; CHECK-NOT:  sdiv <vscale x 4 x i32>
 
 entry:
   br label %loop

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index fabbe0b73e081..feafd8831b668 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve,+bf16 -S 2>%t | FileCheck %s -check-prefix=CHECK
 ; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARK
 
@@ -8,67 +7,15 @@
 
 ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
 define i32 @add(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @add(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18]] = add <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19]] = add <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[ADD_LCSSA]]
-;
+; CHECK-LABEL: @add
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[ADD1:.*]] = add <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ADD:.*]] = add <vscale x 8 x i32> %[[ADD2]], %[[ADD1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> %[[ADD]])
 entry:
   br label %for.body
 
@@ -90,67 +37,15 @@ for.end:                                 ; preds = %for.body, %entry
 
 ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
 define i32 @or(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @or(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18]] = or <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19]] = or <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[OR]] = or i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[OR_LCSSA]]
-;
+; CHECK-LABEL: @or
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[OR1:.*]] = or <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[OR2:.*]] = or <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[OR:.*]] = or <vscale x 8 x i32> %[[OR2]], %[[OR1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> %[[OR]])
 entry:
   br label %for.body
 
@@ -172,67 +67,15 @@ for.end:                                 ; preds = %for.body, %entry
 
 ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
 define i32 @and(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @and(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 -1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 -1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18]] = and <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19]] = and <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = and <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[AND]] = and i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[AND_LCSSA]]
-;
+; CHECK-LABEL: @and
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[AND1:.*]] = and <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[AND2:.*]] = and <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[AND:.*]] = and <vscale x 8 x i32> %[[AND2]], %[[AND1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> %[[AND]])
 entry:
   br label %for.body
 
@@ -254,67 +97,15 @@ for.end:                                 ; preds = %for.body, %entry
 
 ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
 define i32 @xor(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @xor(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18]] = xor <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19]] = xor <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = xor <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[XOR]] = xor i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[XOR_LCSSA]]
-;
+; CHECK-LABEL: @xor
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[XOR1:.*]] = xor <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[XOR2:.*]] = xor <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[XOR:.*]] = xor <vscale x 8 x i32> %[[XOR2]], %[[XOR1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> %[[XOR]])
 entry:
   br label %for.body
 
@@ -336,71 +127,18 @@ for.end:                                 ; preds = %for.body, %entry
 ; SMIN
 
 define i32 @smin(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @smin(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp slt <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp slt <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x i32> [[WIDE_LOAD]], <vscale x 8 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i32> [[WIDE_LOAD2]], <vscale x 8 x i32> [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp slt <vscale x 8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x i32> [[TMP20]], <vscale x 8 x i32> [[TMP21]]
-; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[TMP26]], [[SUM_010]]
-; CHECK-NEXT:    [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP26]], i32 [[SUM_010]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @smin
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[ICMP1:.*]] = icmp slt <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[ICMP2:.*]] = icmp slt <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ICMP:.*]] = icmp slt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32>  %[[SEL]])
 entry:
   br label %for.body
 
@@ -423,71 +161,18 @@ for.end:
 ; UMAX
 
 define i32 @umax(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @umax(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ugt <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp ugt <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x i32> [[WIDE_LOAD]], <vscale x 8 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i32> [[WIDE_LOAD2]], <vscale x 8 x i32> [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ugt <vscale x 8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x i32> [[TMP20]], <vscale x 8 x i32> [[TMP21]]
-; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP_I:%.*]] = icmp ugt i32 [[TMP26]], [[SUM_010]]
-; CHECK-NEXT:    [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP26]], i32 [[SUM_010]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @umax
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[ICMP1:.*]] = icmp ugt <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[ICMP2:.*]] = icmp ugt <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ICMP:.*]] = icmp ugt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32>  %[[SEL]])
 entry:
   br label %for.body
 
@@ -510,67 +195,15 @@ for.end:
 ; FADD (FAST)
 
 define float @fadd_fast(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-LABEL: @fadd_fast(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> zeroinitializer, float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18]] = fadd fast <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19]] = fadd fast <vscale x 8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <vscale x 8 x float> [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = fadd fast float [[TMP24]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret float [[ADD_LCSSA]]
-;
+; CHECK-LABEL: @fadd_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
+; CHECK: %[[ADD1:.*]] = fadd fast <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = fadd fast <vscale x 8 x float> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ADD:.*]] = fadd fast <vscale x 8 x float> %[[ADD2]], %[[ADD1]]
+; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[ADD]])
 entry:
   br label %for.body
 
@@ -591,55 +224,15 @@ for.end:
 ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
 ; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
 define bfloat @fadd_fast_bfloat(bfloat* noalias nocapture readonly %a, i64 %n) {
-; CHECK-LABEL: @fadd_fast_bfloat(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x bfloat> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x bfloat> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds bfloat, bfloat* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds bfloat, bfloat* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds bfloat, bfloat* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast bfloat* [[TMP4]] to <8 x bfloat>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds bfloat, bfloat* [[TMP2]], i32 8
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast bfloat* [[TMP6]] to <8 x bfloat>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8]] = fadd fast <8 x bfloat> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP9]] = fadd fast <8 x bfloat> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x bfloat> [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi bfloat [ 0xR0000, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi bfloat [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds bfloat, bfloat* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load bfloat, bfloat* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = fadd fast bfloat [[TMP12]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi bfloat [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret bfloat [[ADD_LCSSA]]
-;
+; CHECK-LABEL: @fadd_fast_bfloat
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <8 x bfloat>
+; CHECK: %[[LOAD2:.*]] = load <8 x bfloat>
+; CHECK: %[[FADD1:.*]] = fadd fast <8 x bfloat> %[[LOAD1]]
+; CHECK: %[[FADD2:.*]] = fadd fast <8 x bfloat> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = fadd fast <8 x bfloat> %[[FADD2]], %[[FADD1]]
+; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[RDX]])
 entry:
   br label %for.body
 
@@ -661,71 +254,18 @@ for.end:
 
 ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
 define float @fmin_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
-; CHECK-LABEL: @fmin_fast(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = fcmp olt <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19:%.*]] = fcmp olt <vscale x 8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp olt <vscale x 8 x float> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x float> [[TMP20]], <vscale x 8 x float> [[TMP21]]
-; CHECK-NEXT:    [[TMP25:%.*]] = call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp olt float [[TMP26]], [[SUM_07]]
-; CHECK-NEXT:    [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP26]], float [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret float [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @fmin_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
+; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x float> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x float> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x float> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x float> %[[SEL1]], <vscale x 8 x float> %[[SEL2]]
+; CHECK-NEXT: call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> %[[SEL]])
 entry:
   br label %for.body
 
@@ -748,71 +288,18 @@ for.end:
 
 ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
 define float @fmax_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
-; CHECK-LABEL: @fmax_fast(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast ogt <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19:%.*]] = fcmp fast ogt <vscale x 8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <vscale x 8 x float> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select fast <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x float> [[TMP20]], <vscale x 8 x float> [[TMP21]]
-; CHECK-NEXT:    [[TMP25:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp fast ogt float [[TMP26]], [[SUM_07]]
-; CHECK-NEXT:    [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP26]], float [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret float [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @fmax_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
+; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x float> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x float> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x float> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x float> %[[SEL1]], <vscale x 8 x float> %[[SEL2]]
+; CHECK-NEXT: call fast float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> %[[SEL]])
 entry:
   br label %for.body
 
@@ -835,24 +322,8 @@ for.end:
 
 ; CHECK-REMARK: loop not vectorized: value that could not be identified as reduction is used outside the loop
 define void @invariant_store(i32* %dst, i32* readonly %src) {
-; CHECK-LABEL: @invariant_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 42
-; CHECK-NEXT:    store i32 0, i32* [[GEP_DST]], align 4
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[GEP_SRC]], align 4
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[SUM]], [[TMP0]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[GEP_DST]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @invariant_store
+; CHECK-NOT: vector.body
 entry:
   %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
   store i32 0, i32* %gep.dst, align 4
@@ -879,55 +350,15 @@ for.cond.cleanup:
 ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
 ; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
 define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @mul(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8]] = mul <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP9]] = mul <4 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = mul <4 x i32> [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL]] = mul nsw i32 [[TMP12]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[MUL_LCSSA]]
-;
+; CHECK-LABEL: @mul
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <4 x i32>
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]]
+; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
 entry:
   br label %for.body
 
@@ -949,81 +380,19 @@ for.end:                                 ; preds = %for.body, %entry
 ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
 ; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
 define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @memory_dependence(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 4
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD2]]
-; CHECK-NEXT:    [[TMP16:%.*]] = add nuw nsw i64 [[TMP0]], 32
-; CHECK-NEXT:    [[TMP17:%.*]] = add nuw nsw i64 [[TMP1]], 32
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP21]], align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 4
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP23]], align 4
-; CHECK-NEXT:    [[TMP24]] = mul <4 x i32> [[WIDE_LOAD3]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP25]] = mul <4 x i32> [[WIDE_LOAD4]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = mul <4 x i32> [[TMP25]], [[TMP24]]
-; CHECK-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ [[MUL:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
-; CHECK-NEXT:    [[TMP28:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP29]], [[TMP28]]
-; CHECK-NEXT:    [[ADD2:%.*]] = add nuw nsw i64 [[I]], 32
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[ADD2]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[MUL]] = mul nsw i32 [[TMP29]], [[SUM]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[MUL_LCSSA]]
-;
+; CHECK-LABEL: @memory_dependence
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <4 x i32>
+; CHECK: %[[LOAD3:.*]] = load <4 x i32>
+; CHECK: %[[LOAD4:.*]] = load <4 x i32>
+; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
+; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index 61109876f123d..1aabd077926e2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
 ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=false -hints-allow-reordering=true  -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
 ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=true  -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
@@ -6,117 +5,32 @@
 ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
 
 define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD]] = fadd float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP8]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP8]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD]] = fadd float [[TMP13]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_strict(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP7]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP8]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD]] = fadd float [[TMP12]], [[SUM_07]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_strict
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[VEC_PHI]], <vscale x 8 x float> %[[LOAD]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: ret float %[[PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_strict
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ %[[FADD_VEC:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[LOAD_VEC:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: %[[FADD_VEC]] = fadd <vscale x 8 x float> %[[LOAD_VEC]], %[[VEC_PHI]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[FADD_VEC]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -136,195 +50,51 @@ for.end:
 }
 
 define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD]] = fadd float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict_unroll(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP25]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-UNORDERED-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP29]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-UNORDERED-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP33]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP35:%.*]] = mul i32 [[TMP34]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-UNORDERED-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP37]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP38]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[TMP39]] = fadd <vscale x 8 x float> [[WIDE_LOAD4]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT:    [[TMP40]] = fadd <vscale x 8 x float> [[WIDE_LOAD5]], [[VEC_PHI2]]
-; CHECK-UNORDERED-NEXT:    [[TMP41]] = fadd <vscale x 8 x float> [[WIDE_LOAD6]], [[VEC_PHI3]]
-; CHECK-UNORDERED-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 32
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP43]]
-; CHECK-UNORDERED-NEXT:    [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[TMP39]], [[TMP38]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX7:%.*]] = fadd <vscale x 8 x float> [[TMP40]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX8:%.*]] = fadd <vscale x 8 x float> [[TMP41]], [[BIN_RDX7]]
-; CHECK-UNORDERED-NEXT:    [[TMP45:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX8]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP46:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD]] = fadd float [[TMP46]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_strict_unroll(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP25]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 8
-; CHECK-ORDERED-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-ORDERED-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP29]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], 16
-; CHECK-ORDERED-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-ORDERED-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP33]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP35:%.*]] = mul i32 [[TMP34]], 24
-; CHECK-ORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-ORDERED-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP37]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP38:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT:    [[TMP39:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP38]], <vscale x 8 x float> [[WIDE_LOAD1]])
-; CHECK-ORDERED-NEXT:    [[TMP40:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP39]], <vscale x 8 x float> [[WIDE_LOAD2]])
-; CHECK-ORDERED-NEXT:    [[TMP41]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP40]], <vscale x 8 x float> [[WIDE_LOAD3]])
-; CHECK-ORDERED-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 32
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP43]]
-; CHECK-ORDERED-NEXT:    [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP45:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD]] = fadd float [[TMP45]], [[SUM_07]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_strict_unroll
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
+; CHECK-ORDERED-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: %[[LOAD3:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: %[[LOAD4:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[VEC_PHI1]], <vscale x 8 x float> %[[LOAD1]])
+; CHECK-ORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX1]], <vscale x 8 x float> %[[LOAD2]])
+; CHECK-ORDERED: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX2]], <vscale x 8 x float> %[[LOAD3]])
+; CHECK-ORDERED: %[[RDX4]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX3]], <vscale x 8 x float> %[[LOAD4]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ]
+; CHECK-ORDERED: ret float %[[PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_strict_unroll
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ %[[VEC_FADD3:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ %[[VEC_FADD4:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD3:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD4:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <vscale x 8 x float> %[[VEC_LOAD1]], %[[VEC_PHI1]]
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <vscale x 8 x float> %[[VEC_LOAD2]], %[[VEC_PHI2]]
+; CHECK-UNORDERED: %[[VEC_FADD3]] = fadd <vscale x 8 x float> %[[VEC_LOAD3]], %[[VEC_PHI3]]
+; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <vscale x 8 x float> %[[VEC_LOAD4]], %[[VEC_PHI4]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <vscale x 8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]]
+; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <vscale x 8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]]
+; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <vscale x 8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]]
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[BIN_RDX3]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -344,189 +114,70 @@ for.end:
 }
 
 define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[A1:%.*]] = load float, float* [[A]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[A2]], [[ENTRY:%.*]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD_PHI2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD1]] = fadd float [[TMP0]], [[ADD_PHI2]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[OR:%.*]] = or i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD2]] = fadd float [[TMP1]], [[ADD_PHI1]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    store float [[ADD1_LCSSA]], float* [[A]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    ret void
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict_interleave(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-UNORDERED-NEXT:    [[A1:%.*]] = load float, float* [[A]], align 4
-; CHECK-UNORDERED-NEXT:    [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], -2
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float [[A2]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float [[A1]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i64> [[TMP9]], zeroinitializer
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP10]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-UNORDERED-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = mul i64 2, [[TMP13]]
-; CHECK-UNORDERED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP14]], i32 0
-; CHECK-UNORDERED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x float> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-UNORDERED-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP15]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
-; CHECK-UNORDERED-NEXT:    [[TMP16]] = fadd <vscale x 4 x float> [[WIDE_MASKED_GATHER]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = or <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], <vscale x 4 x i64> [[TMP17]]
-; CHECK-UNORDERED-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP18]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
-; CHECK-UNORDERED-NEXT:    [[TMP19]] = fadd <vscale x 4 x float> [[WIDE_MASKED_GATHER2]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-UNORDERED-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP16]])
-; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP19]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD1]] = fadd float [[TMP25]], [[ADD_PHI2]]
-; CHECK-UNORDERED-NEXT:    [[OR:%.*]] = or i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
-; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD2]] = fadd float [[TMP26]], [[ADD_PHI1]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    store float [[ADD1_LCSSA]], float* [[A]], align 4
-; CHECK-UNORDERED-NEXT:    store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
-; CHECK-UNORDERED-NEXT:    ret void
-;
-; CHECK-ORDERED-LABEL: @fadd_strict_interleave(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-ORDERED-NEXT:    [[A1:%.*]] = load float, float* [[A]], align 4
-; CHECK-ORDERED-NEXT:    [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], -2
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = add <vscale x 4 x i64> [[TMP7]], zeroinitializer
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = mul <vscale x 4 x i64> [[TMP8]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP9]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = mul i64 2, [[TMP11]]
-; CHECK-ORDERED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i32 0
-; CHECK-ORDERED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-ORDERED-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
-; CHECK-ORDERED-NEXT:    [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], <vscale x 4 x float> [[WIDE_MASKED_GATHER]])
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = or <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[B]], <vscale x 4 x i64> [[TMP15]]
-; CHECK-ORDERED-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP16]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
-; CHECK-ORDERED-NEXT:    [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[WIDE_MASKED_GATHER2]])
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 4
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
-; CHECK-ORDERED-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD1]] = fadd float [[TMP21]], [[ADD_PHI2]]
-; CHECK-ORDERED-NEXT:    [[OR:%.*]] = or i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
-; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD2]] = fadd float [[TMP22]], [[ADD_PHI1]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    store float [[ADD1_LCSSA]], float* [[A]], align 4
-; CHECK-ORDERED-NEXT:    store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
-; CHECK-ORDERED-NEXT:    ret void
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_strict_interleave
+; CHECK-ORDERED: entry
+; CHECK-ORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK-ORDERED: %[[LOAD1:.*]] = load float, float* %a
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK-ORDERED: vector.ph
+; CHECK-ORDERED: %[[STEPVEC1:.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-ORDERED: %[[STEPVEC_ADD1:.*]] = add <vscale x 4 x i64> %[[STEPVEC1]], zeroinitializer
+; CHECK-ORDERED: %[[STEPVEC_MUL:.*]] = mul <vscale x 4 x i64> %[[STEPVEC_ADD1]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED: %[[INDUCTION:.*]] = add <vscale x 4 x i64> zeroinitializer, %[[STEPVEC_MUL]]
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ]
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ]
+; CHECK-ORDERED: %[[VEC_IND:.*]] = phi <vscale x 4 x i64> [ %[[INDUCTION]], %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK-ORDERED: %[[GEP1:.*]] = getelementptr inbounds float, float* %b, <vscale x 4 x i64> %[[VEC_IND]]
+; CHECK-ORDERED: %[[MGATHER1:.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> %[[GEP1]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
+; CHECK-ORDERED: %[[RDX1]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI1]], <vscale x 4 x float> %[[MGATHER1]])
+; CHECK-ORDERED: %[[OR:.*]] = or <vscale x 4 x i64> %[[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED: %[[GEP2:.*]] = getelementptr inbounds float, float* %b, <vscale x 4 x i64> %[[OR]]
+; CHECK-ORDERED: %[[MGATHER2:.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> %[[GEP2]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
+; CHECK-ORDERED: %[[RDX2]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI2]], <vscale x 4 x float> %[[MGATHER2]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: ret void
+
+; CHECK-UNORDERED-LABEL: @fadd_strict_interleave
+; CHECK-UNORDERED: entry
+; CHECK-UNORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float* %a
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK-UNORDERED: vector.ph
+; CHECK-UNORDERED: %[[INS_ELT2:.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float %[[LOAD2]], i32 0
+; CHECK-UNORDERED: %[[INS_ELT1:.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float %[[LOAD1]], i32 0
+; CHECK-UNORDERED: %[[STEPVEC1:.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-UNORDERED: %[[STEPVEC_ADD1:.*]] = add <vscale x 4 x i64> %[[STEPVEC1]], zeroinitializer
+; CHECK-UNORDERED: %[[STEPVEC_MUL:.*]] = mul <vscale x 4 x i64> %[[STEPVEC_ADD1]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-UNORDERED: %[[INDUCTION:.*]] = add <vscale x 4 x i64> zeroinitializer, %[[STEPVEC_MUL]]
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <vscale x 4 x float> [ %[[INS_ELT2]], %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <vscale x 4 x float> [ %[[INS_ELT1]], %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[GEP1:.*]] = getelementptr inbounds float, float* %b, <vscale x 4 x i64>
+; CHECK-UNORDERED: %[[MGATHER1:.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> %[[GEP1]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
+; CHECK-UNORDERED:  %[[VEC_FADD1]] = fadd <vscale x 4 x float> %[[MGATHER1]], %[[VEC_PHI1]]
+; CHECK-UNORDERED: %[[OR:.*]] = or <vscale x 4 x i64> {{.*}}, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-UNORDERED: %[[GEP2:.*]] = getelementptr inbounds float, float* %b, <vscale x 4 x i64> %[[OR]]
+; CHECK-UNORDERED: %[[MGATHER2:.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> %[[GEP2]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <vscale x 4 x float> %[[MGATHER2]], %[[VEC_PHI2]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[VEC_RDX1:.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[VEC_FADD1]])
+; CHECK-UNORDERED: %[[VEC_RDX2:.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD3]], {{.*}}
+; CHECK-UNORDERED: %[[LOAD4:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float %[[LOAD4]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RDX1:.*]] = phi float [ %[[FADD1]], %for.body ], [ %[[VEC_RDX1]], %middle.block ]
+; CHECK-UNORDERED: %[[RDX2:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[VEC_RDX2]], %middle.block ]
+; CHECK-UNORDERED: store float %[[RDX1]], float* %a
+; CHECK-UNORDERED: store float %[[RDX2]], float* {{.*}}
+; CHECK-UNORDERED: ret void
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   %arrayidxa = getelementptr inbounds float, float* %a, i64 1
@@ -556,160 +207,42 @@ for.end:
 }
 
 define float @fadd_of_sum(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body.preheader:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[RDX]] = fadd float [[RES_014]], [[ADD]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]]
-; CHECK-NOT-VECTORIZED:       for.end.loopexit:
-; CHECK-NOT-VECTORIZED-NEXT:    [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_END]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[RES]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_of_sum(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-UNORDERED:       for.body.preheader:
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP2]]
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]]
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP8]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]]
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-UNORDERED-NEXT:    [[TMP13]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[TMP12]]
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP13]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT:    [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD:%.*]] = fadd float [[TMP18]], [[TMP19]]
-; CHECK-UNORDERED-NEXT:    [[RDX]] = fadd float [[RES_014]], [[ADD]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK-UNORDERED:       for.end.loopexit:
-; CHECK-UNORDERED-NEXT:    [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_END]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[RES]]
-;
-; CHECK-ORDERED-LABEL: @fadd_of_sum(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01
-; CHECK-ORDERED-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-ORDERED:       for.body.preheader:
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP2]]
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]]
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP8]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP11]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-ORDERED-NEXT:    [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP12]])
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT:    [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD:%.*]] = fadd float [[TMP17]], [[TMP18]]
-; CHECK-ORDERED-NEXT:    [[RDX]] = fadd float [[RES_014]], [[ADD]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK-ORDERED:       for.end.loopexit:
-; CHECK-ORDERED-NEXT:    [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_END]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-ORDERED-NEXT:    ret float [[RES]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_of_sum
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-ORDERED: %[[ADD:.*]] = fadd <vscale x 4 x float> %[[LOAD1]], %[[LOAD2]]
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI1]], <vscale x 4 x float> %[[ADD]])
+; CHECK-ORDERED: for.end.loopexit
+; CHECK-ORDERED: %[[EXIT_PHI:.*]] = phi float [ {{.*}}, %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
+; CHECK-ORDERED: ret float %[[PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_of_sum
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <vscale x 4 x float> %[[VEC_LOAD1]], %[[VEC_LOAD2]]
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <vscale x 4 x float> %[[VEC_PHI]], %[[VEC_FADD1]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], %[[LOAD2]]
+; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[FADD1]]
+; CHECK-UNORDERED: for.end.loopexit
+; CHECK-UNORDERED: %[[EXIT:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT]], %for.end.loopexit ]
+; CHECK-UNORDERED: ret float %[[SUM]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   %arrayidx = getelementptr inbounds float, float* %a, i64 1
@@ -736,155 +269,50 @@ for.end:                                 ; preds = %for.body, %entry
 }
 
 define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[RES:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP0]], 0.000000e+00
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-NOT-VECTORIZED:       if.then:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_INC]]
-; CHECK-NOT-VECTORIZED:       for.inc:
-; CHECK-NOT-VECTORIZED-NEXT:    [[PHI:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[FADD]] = fadd float [[RES]], [[PHI]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[RDX]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_conditional(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float 1.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-UNORDERED-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD]]
-; CHECK-UNORDERED-NEXT:    [[TMP13]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP13]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP18]], 0.000000e+00
-; CHECK-UNORDERED-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-UNORDERED:       if.then:
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    br label [[FOR_INC]]
-; CHECK-UNORDERED:       for.inc:
-; CHECK-UNORDERED-NEXT:    [[PHI:%.*]] = phi float [ [[TMP19]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[FADD]] = fadd float [[RES]], [[PHI]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[RDX]]
-;
-; CHECK-ORDERED-LABEL: @fadd_conditional(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD]]
-; CHECK-ORDERED-NEXT:    [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[PREDPHI]])
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP17]], 0.000000e+00
-; CHECK-ORDERED-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-ORDERED:       if.then:
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    br label [[FOR_INC]]
-; CHECK-ORDERED:       for.inc:
-; CHECK-ORDERED-NEXT:    [[PHI:%.*]] = phi float [ [[TMP18]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[FADD]] = fadd float [[RES]], [[PHI]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[RDX]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_conditional
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-ORDERED: %[[FCMP:.*]] = fcmp une <vscale x 4 x float> %[[LOAD]], zeroinitializer
+; CHECK-ORDERED: %[[MASKED_LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* {{.*}}, i32 4, <vscale x 4 x i1> %[[FCMP]], <vscale x 4 x float> poison)
+; CHECK-ORDERED: %[[XOR:.*]] = xor <vscale x 4 x i1> %[[FCMP]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED: %[[SELECT:.*]] = select <vscale x 4 x i1> %[[XOR]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> %[[MASKED_LOAD]]
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI]], <vscale x 4 x float> %[[SELECT]])
+; CHECK-ORDERED: scalar.ph
+; CHECK-ORDERED: %[[MERGE_RDX:.*]] = phi float [ 1.000000e+00, %entry ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: %[[RES:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
+; CHECK-ORDERED: if.then
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-ORDERED: for.inc
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[LOAD2]], %if.then ], [ 3.000000e+00, %for.body ]
+; CHECK-ORDERED: %[[FADD]] = fadd float %[[RES]], %[[PHI]]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: ret float %[[RDX_PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_conditional
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float 1.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-UNORDERED: %[[FCMP:.*]] = fcmp une <vscale x 4 x float> %[[LOAD1]], zeroinitializer
+; CHECK-UNORDERED: %[[MASKED_LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* {{.*}}, i32 4, <vscale x 4 x i1> %[[FCMP]], <vscale x 4 x float> poison)
+; CHECK-UNORDERED: %[[XOR:.*]] = xor <vscale x 4 x i1> %[[FCMP]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-UNORDERED: %[[SELECT:.*]] = select <vscale x 4 x i1> %[[XOR]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> %[[MASKED_LOAD]]
+; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <vscale x 4 x float> %[[VEC_PHI]], %[[SELECT]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[VEC_FADD]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
+; CHECK-UNORDERED: for.inc
+; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RES]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RDX_PHI]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -916,101 +344,30 @@ for.end:
 
 ; Negative test - loop contains multiple fadds which we cannot safely reorder
 define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD3]] = fadd float [[ADD]], [[TMP1]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[RDX]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_multiple(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float -0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = fadd <vscale x 8 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP12]] = fadd <vscale x 8 x float> [[TMP8]], [[WIDE_LOAD1]]
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 8
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP12]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD:%.*]] = fadd float [[SUM]], [[TMP17]]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD3]] = fadd float [[ADD]], [[TMP18]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[RDX]]
-;
-; CHECK-ORDERED-LABEL: @fadd_multiple(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD3]] = fadd float [[ADD]], [[TMP1]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    ret float [[RDX]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_multiple
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @fadd_multiple
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[PHI:.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float -0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>
+; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <vscale x 8 x float> %[[PHI]], %[[VEC_LOAD1]]
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <vscale x 8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[SUM]], %[[LOAD1]]
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RET]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -1035,249 +392,64 @@ for.end:                                         ; preds = %for.body
 
 ; Test case where loop has a call to the llvm.fmuladd intrinsic.
 define float @fmuladd_strict(float* %a, float* %b, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_strict(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP56:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP59:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP25]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-UNORDERED-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP29]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-UNORDERED-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP33]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP35:%.*]] = mul i32 [[TMP34]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-UNORDERED-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP37]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT:    [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP43:%.*]] = bitcast float* [[TMP42]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP43]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP44:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP45:%.*]] = mul i32 [[TMP44]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP45]]
-; CHECK-UNORDERED-NEXT:    [[TMP47:%.*]] = bitcast float* [[TMP46]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP47]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP48:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP49:%.*]] = mul i32 [[TMP48]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP49]]
-; CHECK-UNORDERED-NEXT:    [[TMP51:%.*]] = bitcast float* [[TMP50]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP51]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP52:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP53:%.*]] = mul i32 [[TMP52]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP53]]
-; CHECK-UNORDERED-NEXT:    [[TMP55:%.*]] = bitcast float* [[TMP54]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP55]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP56]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT:    [[TMP57]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT:    [[TMP58]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[WIDE_LOAD9]], <vscale x 8 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT:    [[TMP59]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[WIDE_LOAD10]], <vscale x 8 x float> [[VEC_PHI3]])
-; CHECK-UNORDERED-NEXT:    [[TMP60:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP61:%.*]] = mul i64 [[TMP60]], 32
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP61]]
-; CHECK-UNORDERED-NEXT:    [[TMP62:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP62]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[TMP57]], [[TMP56]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX11:%.*]] = fadd <vscale x 8 x float> [[TMP58]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX12:%.*]] = fadd <vscale x 8 x float> [[TMP59]], [[BIN_RDX11]]
-; CHECK-UNORDERED-NEXT:    [[TMP63:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX12]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP64:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP65:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP64]], float [[TMP65]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_strict(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP63:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP25]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 8
-; CHECK-ORDERED-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-ORDERED-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP29]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], 16
-; CHECK-ORDERED-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-ORDERED-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP33]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP35:%.*]] = mul i32 [[TMP34]], 24
-; CHECK-ORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-ORDERED-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP37]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT:    [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP43:%.*]] = bitcast float* [[TMP42]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP43]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP44:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP45:%.*]] = mul i32 [[TMP44]], 8
-; CHECK-ORDERED-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP45]]
-; CHECK-ORDERED-NEXT:    [[TMP47:%.*]] = bitcast float* [[TMP46]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP47]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP48:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP49:%.*]] = mul i32 [[TMP48]], 16
-; CHECK-ORDERED-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP49]]
-; CHECK-ORDERED-NEXT:    [[TMP51:%.*]] = bitcast float* [[TMP50]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP51]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP52:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP53:%.*]] = mul i32 [[TMP52]], 24
-; CHECK-ORDERED-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP53]]
-; CHECK-ORDERED-NEXT:    [[TMP55:%.*]] = bitcast float* [[TMP54]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP55]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP56:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED-NEXT:    [[TMP57:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED-NEXT:    [[TMP58:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED-NEXT:    [[TMP59:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED-NEXT:    [[TMP60:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP56]])
-; CHECK-ORDERED-NEXT:    [[TMP61:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP60]], <vscale x 8 x float> [[TMP57]])
-; CHECK-ORDERED-NEXT:    [[TMP62:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP61]], <vscale x 8 x float> [[TMP58]])
-; CHECK-ORDERED-NEXT:    [[TMP63]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP62]], <vscale x 8 x float> [[TMP59]])
-; CHECK-ORDERED-NEXT:    [[TMP64:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP65:%.*]] = mul i64 [[TMP64]], 32
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP65]]
-; CHECK-ORDERED-NEXT:    [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP67:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP68:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP67]], float [[TMP68]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fmuladd_strict
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ]
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[RDX3]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_strict
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED:    [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED:    [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -1300,249 +472,64 @@ for.end:
 
 ; Same as above but where the call to the llvm.fmuladd intrinsic has a fast-math flag.
 define float @fmuladd_strict_fmf(float* %a, float* %b, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict_fmf(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP56:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP59:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP25]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-UNORDERED-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP29]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-UNORDERED-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP33]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP35:%.*]] = mul i32 [[TMP34]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-UNORDERED-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP37]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT:    [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP43:%.*]] = bitcast float* [[TMP42]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP43]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP44:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP45:%.*]] = mul i32 [[TMP44]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP45]]
-; CHECK-UNORDERED-NEXT:    [[TMP47:%.*]] = bitcast float* [[TMP46]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP47]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP48:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP49:%.*]] = mul i32 [[TMP48]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP49]]
-; CHECK-UNORDERED-NEXT:    [[TMP51:%.*]] = bitcast float* [[TMP50]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP51]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP52:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT:    [[TMP53:%.*]] = mul i32 [[TMP52]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP53]]
-; CHECK-UNORDERED-NEXT:    [[TMP55:%.*]] = bitcast float* [[TMP54]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP55]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP56]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT:    [[TMP57]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT:    [[TMP58]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[WIDE_LOAD9]], <vscale x 8 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT:    [[TMP59]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[WIDE_LOAD10]], <vscale x 8 x float> [[VEC_PHI3]])
-; CHECK-UNORDERED-NEXT:    [[TMP60:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP61:%.*]] = mul i64 [[TMP60]], 32
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP61]]
-; CHECK-UNORDERED-NEXT:    [[TMP62:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP62]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[TMP57]], [[TMP56]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX11:%.*]] = fadd nnan <vscale x 8 x float> [[TMP58]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX12:%.*]] = fadd nnan <vscale x 8 x float> [[TMP59]], [[BIN_RDX11]]
-; CHECK-UNORDERED-NEXT:    [[TMP63:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX12]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP64:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP65:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP64]], float [[TMP65]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_strict_fmf(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP63:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP25]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 8
-; CHECK-ORDERED-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-ORDERED-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP29]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], 16
-; CHECK-ORDERED-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-ORDERED-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP33]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP35:%.*]] = mul i32 [[TMP34]], 24
-; CHECK-ORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-ORDERED-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP37]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT:    [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP43:%.*]] = bitcast float* [[TMP42]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP43]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP44:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP45:%.*]] = mul i32 [[TMP44]], 8
-; CHECK-ORDERED-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP45]]
-; CHECK-ORDERED-NEXT:    [[TMP47:%.*]] = bitcast float* [[TMP46]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP47]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP48:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP49:%.*]] = mul i32 [[TMP48]], 16
-; CHECK-ORDERED-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP49]]
-; CHECK-ORDERED-NEXT:    [[TMP51:%.*]] = bitcast float* [[TMP50]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP51]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP52:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT:    [[TMP53:%.*]] = mul i32 [[TMP52]], 24
-; CHECK-ORDERED-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP53]]
-; CHECK-ORDERED-NEXT:    [[TMP55:%.*]] = bitcast float* [[TMP54]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP55]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP56:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED-NEXT:    [[TMP57:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED-NEXT:    [[TMP58:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED-NEXT:    [[TMP59:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED-NEXT:    [[TMP60:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP56]])
-; CHECK-ORDERED-NEXT:    [[TMP61:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP60]], <vscale x 8 x float> [[TMP57]])
-; CHECK-ORDERED-NEXT:    [[TMP62:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP61]], <vscale x 8 x float> [[TMP58]])
-; CHECK-ORDERED-NEXT:    [[TMP63]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP62]], <vscale x 8 x float> [[TMP59]])
-; CHECK-ORDERED-NEXT:    [[TMP64:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP65:%.*]] = mul i64 [[TMP64]], 32
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP65]]
-; CHECK-ORDERED-NEXT:    [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP67:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP68:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP67]], float [[TMP68]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fmuladd_strict_fmf
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ]
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[RDX3]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED:    [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED:    [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict_fmf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll
index 3e90c563438de..cf0dbb30d0d37 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S -o - < %s | FileCheck %s
 ; RUN: opt -mattr=+sve -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S -o - < %s | FileCheck %s
 
@@ -17,56 +16,17 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define void @foo(i32* %data1, i32* %data2) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], -1
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[DATA1:%.*]], i64 [[INDUCTION]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[DATA1]], i64 [[INDUCTION1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:         br i1 {{%.*}}, label %pred.store.if, label %pred.store.continue
 ; CHECK:       pred.store.if:
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP0]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK-NEXT:    store i32 {{%.*}}, i32* {{%.*}}
+; CHECK-NEXT:    br label %pred.store.continue
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
+; CHECK-NEXT:    br i1 {{%.*}}, label %pred.store.if2, label %pred.store.continue3
 ; CHECK:       pred.store.if2:
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP1]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE3]]
+; CHECK-NEXT:    store i32 {{%.*}}, i32* {{%.*}}
+; CHECK-NEXT:    br label %pred.store.continue3
 ; CHECK:       pred.store.continue3:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DATA1]], i64 [[I]]
-; CHECK-NEXT:    [[LD:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[LD]], [[LD]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]]
-; CHECK:       if.then:
-; CHECK-NEXT:    store i32 [[LD]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[I_NEXT]] = add nsw i64 [[I]], -1
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[I]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       while.end:
-; CHECK-NEXT:    ret void
-;
 
 entry:
   br label %while.body

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
index 57d9c59b15b28..20d2dc0b7cdae 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
 
@@ -6,91 +5,17 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-ios5.0.0"
 
 define void @selects_1(i32* nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) {
-; CHECK-LABEL: @selects_1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP26]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = and <4 x i32> [[WIDE_LOAD]], <i32 2047, i32 2047, i32 2047, i32 2047>
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i32> [[WIDE_LOAD1]], <i32 2047, i32 2047, i32 2047, i32 2047>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <4 x i32> [[TMP9]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i32> [[TMP8]]
-; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i32> [[TMP9]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq <4 x i32> [[TMP8]], [[BROADCAST_SPLAT5]]
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq <4 x i32> [[TMP9]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> <i32 30, i32 30, i32 30, i32 30>, <4 x i32> [[TMP8]]
-; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> <i32 30, i32 30, i32 30, i32 30>, <4 x i32> [[TMP9]]
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ugt <4 x i32> [[TMP12]], [[BROADCAST_SPLAT9]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp ugt <4 x i32> [[TMP13]], [[BROADCAST_SPLAT11]]
-; CHECK-NEXT:    [[TMP20:%.*]] = select <4 x i1> [[TMP18]], <4 x i32> [[TMP12]], <4 x i32> [[TMP16]]
-; CHECK-NEXT:    [[TMP21:%.*]] = select <4 x i1> [[TMP19]], <4 x i32> [[TMP13]], <4 x i32> [[TMP17]]
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP21]], <4 x i32>* [[TMP23]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[TMP25]], 2047
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[AND]], [[A]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP1]], i32 10, i32 [[AND]]
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[AND]], [[B]]
-; CHECK-NEXT:    [[COND6:%.*]] = select i1 [[CMP2]], i32 30, i32 [[AND]]
-; CHECK-NEXT:    [[CMP7:%.*]] = icmp ugt i32 [[COND]], [[C]]
-; CHECK-NEXT:    [[COND11:%.*]] = select i1 [[CMP7]], i32 [[COND]], i32 [[COND6]]
-; CHECK-NEXT:    store i32 [[COND11]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-;
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond = select i1 %cmp1, i32 10, i32 %and
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond6 = select i1 %cmp2, i32 30, i32 %and
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
 
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond = select i1 %cmp1, i32 10, i32 %and
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond6 = select i1 %cmp2, i32 30, i32 %and
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
 
+; CHECK-LABEL: define void @selects_1(
+; CHECK:       vector.body:
+; CHECK:         select <4 x i1>
 
 entry:
   %cmp26 = icmp sgt i32 %N, 0

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll
index 78955ab51a20f..e98030213a119 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt -loop-vectorize -force-ordered-reductions=true -force-vector-width=1 -S < %s -debug 2> %t.debug | FileCheck %s
 ; RUN: cat %t.debug | FileCheck %s --check-prefix=CHECK-DEBUG
@@ -9,32 +8,7 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define void @foo(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %M, i64 %N) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
-; CHECK:       for.body.us:
-; CHECK-NEXT:    [[I_023_US:%.*]] = phi i64 [ [[INC8_US:%.*]], [[FOR_COND3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[I_023_US]]
-; CHECK-NEXT:    [[MUL_US:%.*]] = mul nsw i64 [[I_023_US]], [[N:%.*]]
-; CHECK-NEXT:    br label [[FOR_BODY3_US:%.*]]
-; CHECK:       for.body3.us:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_US]] ], [ [[ADD6_US:%.*]], [[FOR_BODY3_US]] ]
-; CHECK-NEXT:    [[J_021_US:%.*]] = phi i64 [ 0, [[FOR_BODY_US]] ], [ [[INC_US:%.*]], [[FOR_BODY3_US]] ]
-; CHECK-NEXT:    [[ADD_US:%.*]] = add nsw i64 [[J_021_US]], [[MUL_US]]
-; CHECK-NEXT:    [[ARRAYIDX4_US:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[ADD_US]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX4_US]], align 4
-; CHECK-NEXT:    [[ADD6_US]] = fadd float [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[INC_US]] = add nuw nsw i64 [[J_021_US]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC_US]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND3]], label [[FOR_BODY3_US]]
-; CHECK:       for.cond3:
-; CHECK-NEXT:    [[ADD6_US_LCSSA:%.*]] = phi float [ [[ADD6_US]], [[FOR_BODY3_US]] ]
-; CHECK-NEXT:    store float [[ADD6_US_LCSSA]], float* [[ARRAYIDX_US]], align 4
-; CHECK-NEXT:    [[INC8_US]] = add nuw nsw i64 [[I_023_US]], 1
-; CHECK-NEXT:    [[EXITCOND26_NOT:%.*]] = icmp eq i64 [[INC8_US]], [[M:%.*]]
-; CHECK-NEXT:    br i1 [[EXITCOND26_NOT]], label [[EXIT:%.*]], label [[FOR_BODY_US]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NOT: vector.body
 
 entry:
   br label %for.body.us

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
index 3eb8c2f6718f1..a0dba427ce3a5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
 ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=false -hints-allow-reordering=true  -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
 ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=true  -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
@@ -6,105 +5,32 @@
 ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
 
 define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD]] = fadd float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP4]] = fadd <8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP4]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD]] = fadd float [[TMP7]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_strict(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP4]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD]] = fadd float [[TMP6]], [[SUM_07]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_strict
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI]], <8 x float> %[[LOAD]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: ret float %[[PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_strict
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[FADD_VEC:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[LOAD_VEC:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[FADD_VEC]] = fadd <8 x float> %[[LOAD_VEC]], %[[VEC_PHI]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[FADD_VEC]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -125,105 +51,32 @@ for.end:
 
 ; Same as above but where fadd has a fast-math flag.
 define float @fadd_strict_fmf(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_fmf(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD]] = fadd nnan float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict_fmf(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP4]] = fadd nnan <8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP4]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD]] = fadd nnan float [[TMP7]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_strict_fmf(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP4]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD]] = fadd nnan float [[TMP6]], [[SUM_07]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_strict_fmf
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX:%.*]], %vector.body ]
+; CHECK-ORDERED: [[LOAD_VEC:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[RDX]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[LOAD_VEC]])
+; CHECK-ORDERED: for.end:
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fadd_strict_fmf
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FADD_VEC:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[LOAD_VEC:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[FADD_VEC]] = fadd nnan <8 x float> [[LOAD_VEC]], [[VEC_PHI]]
+; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd 
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[FADD_VEC]])
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[FADD:%.*]] = fadd nnan float [[LOAD]], {{.*}}
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[FADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_fmf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -243,147 +96,51 @@ for.end:
 }
 
 define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD]] = fadd float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict_unroll(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 32
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP16]] = fadd <8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[TMP17]] = fadd <8 x float> [[WIDE_LOAD4]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT:    [[TMP18]] = fadd <8 x float> [[WIDE_LOAD5]], [[VEC_PHI2]]
-; CHECK-UNORDERED-NEXT:    [[TMP19]] = fadd <8 x float> [[WIDE_LOAD6]], [[VEC_PHI3]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP17]], [[TMP16]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX7:%.*]] = fadd <8 x float> [[TMP18]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX8:%.*]] = fadd <8 x float> [[TMP19]], [[BIN_RDX7]]
-; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX8]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD]] = fadd float [[TMP22]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_strict_unroll(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 32
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP16]], <8 x float> [[WIDE_LOAD1]])
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP17]], <8 x float> [[WIDE_LOAD2]])
-; CHECK-ORDERED-NEXT:    [[TMP19]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP18]], <8 x float> [[WIDE_LOAD3]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD]] = fadd float [[TMP21]], [[SUM_07]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_strict_unroll
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
+; CHECK-ORDERED-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]])
+; CHECK-ORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]])
+; CHECK-ORDERED: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]])
+; CHECK-ORDERED: %[[RDX4]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ]
+; CHECK-ORDERED: ret float %[[PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_strict_unroll
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED:  %[[VEC_PHI1:.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
+; CHECK-UNORDERED:  %[[VEC_PHI2:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED:  %[[VEC_PHI3:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD3:.*]], %vector.body ]
+; CHECK-UNORDERED:  %[[VEC_PHI4:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD4:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <8 x float> %[[VEC_LOAD1]], %[[VEC_PHI1]]
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_LOAD2]], %[[VEC_PHI2]]
+; CHECK-UNORDERED: %[[VEC_FADD3]] = fadd <8 x float> %[[VEC_LOAD3]], %[[VEC_PHI3]]
+; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <8 x float> %[[VEC_LOAD4]], %[[VEC_PHI4]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]]
+; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]]
+; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]]
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[BIN_RDX3]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -411,171 +168,63 @@ for.end:
 ; return sum;
 
 define float @fadd_strict_unroll_last_val(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll_last_val(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body.preheader:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[FADD]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED:       for.cond.cleanup:
-; CHECK-NOT-VECTORIZED-NEXT:    [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[FADD2:%.*]] = fadd float [[FADD_LCSSA]], 4.200000e+01
-; CHECK-NOT-VECTORIZED-NEXT:    store float [[FADD2]], float* [[B:%.*]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_END]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_LCSSA:%.*]] = phi float [ [[FADD_LCSSA]], [[FOR_COND_CLEANUP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[SUM_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict_unroll_last_val(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-UNORDERED:       for.body.preheader:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP16]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-UNORDERED-NEXT:    [[TMP17]] = fadd <8 x float> [[VEC_PHI1]], [[WIDE_LOAD4]]
-; CHECK-UNORDERED-NEXT:    [[TMP18]] = fadd <8 x float> [[VEC_PHI2]], [[WIDE_LOAD5]]
-; CHECK-UNORDERED-NEXT:    [[TMP19]] = fadd <8 x float> [[VEC_PHI3]], [[WIDE_LOAD6]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP17]], [[TMP16]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX7:%.*]] = fadd <8 x float> [[TMP18]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX8:%.*]] = fadd <8 x float> [[TMP19]], [[BIN_RDX7]]
-; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX8]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[FADD]] = fadd float [[SUM]], [[TMP22]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK-UNORDERED:       for.cond.cleanup:
-; CHECK-UNORDERED-NEXT:    [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    [[FADD2:%.*]] = fadd float [[FADD_LCSSA]], 4.200000e+01
-; CHECK-UNORDERED-NEXT:    store float [[FADD2]], float* [[B:%.*]], align 4
-; CHECK-UNORDERED-NEXT:    br label [[FOR_END]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[SUM_LCSSA:%.*]] = phi float [ [[FADD_LCSSA]], [[FOR_COND_CLEANUP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[SUM_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_strict_unroll_last_val(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-ORDERED-NEXT:    br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-ORDERED:       for.body.preheader:
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP16]], <8 x float> [[WIDE_LOAD1]])
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP17]], <8 x float> [[WIDE_LOAD2]])
-; CHECK-ORDERED-NEXT:    [[TMP19]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP18]], <8 x float> [[WIDE_LOAD3]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT:    [[SUM:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[FADD]] = fadd float [[SUM]], [[TMP21]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK-ORDERED:       for.cond.cleanup:
-; CHECK-ORDERED-NEXT:    [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    [[FADD2:%.*]] = fadd float [[FADD_LCSSA]], 4.200000e+01
-; CHECK-ORDERED-NEXT:    store float [[FADD2]], float* [[B:%.*]], align 4
-; CHECK-ORDERED-NEXT:    br label [[FOR_END]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[SUM_LCSSA:%.*]] = phi float [ [[FADD_LCSSA]], [[FOR_COND_CLEANUP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    ret float [[SUM_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_strict_unroll_last_val
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
+; CHECK-ORDERED-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]])
+; CHECK-ORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]])
+; CHECK-ORDERED: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]])
+; CHECK-ORDERED: %[[RDX4]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]])
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: %[[SUM_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ {{.*}}, %scalar.ph ]
+; CHECK-ORDERED: %[[LOAD5:.*]] = load float, float*
+; CHECK-ORDERED: %[[FADD]] =  fadd float %[[SUM_PHI]], %[[LOAD5]]
+; CHECK-ORDERED: for.cond.cleanup
+; CHECK-ORDERED: %[[FADD_LCSSA:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX4]], %middle.block ]
+; CHECK-ORDERED: %[[FADD_42:.*]] = fadd float %[[FADD_LCSSA]], 4.200000e+01
+; CHECK-ORDERED: store float %[[FADD_42]], float* %b
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ]
+; CHECK-ORDERED: ret float %[[SUM_LCSSA]]
+
+; CHECK-UNORDERED-LABEL: @fadd_strict_unroll_last_val
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD3:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD4:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <8 x float> %[[VEC_PHI1]], %[[VEC_LOAD1]]
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_PHI2]], %[[VEC_LOAD2]]
+; CHECK-UNORDERED: %[[VEC_FADD3]] = fadd <8 x float> %[[VEC_PHI3]], %[[VEC_LOAD3]]
+; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <8 x float> %[[VEC_PHI4]], %[[VEC_LOAD4]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]]
+; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]]
+; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]]
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[BIN_RDX3]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD:.*]] = fadd float {{.*}}, %[[LOAD]]
+; CHECK-UNORDERED: for.cond.cleanup
+; CHECK-UNORDERED: %[[FADD_LCSSA:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: %[[FADD_42:.*]] = fadd float %[[FADD_LCSSA]], 4.200000e+01
+; CHECK-UNORDERED: store float %[[FADD_42]], float* %b
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ]
+; CHECK-UNORDERED: ret float %[[SUM_LCSSA]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll_last_val
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   %cmp = icmp sgt i64 %n, 0
@@ -603,161 +252,55 @@ for.end:
 }
 
 define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[A1:%.*]] = load float, float* [[A]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[A2]], [[ENTRY:%.*]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD_PHI2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD1]] = fadd float [[TMP0]], [[ADD_PHI2]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[OR:%.*]] = or i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD2]] = fadd float [[TMP1]], [[ADD_PHI1]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    store float [[ADD1_LCSSA]], float* [[A]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    ret void
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict_interleave(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-UNORDERED-NEXT:    [[A1:%.*]] = load float, float* [[A]], align 4
-; CHECK-UNORDERED-NEXT:    [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], -2
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, float [[A2]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, float [[A1]], i32 0
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]]
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[TMP8]], align 4
-; CHECK-UNORDERED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-UNORDERED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-UNORDERED-NEXT:    [[TMP9]] = fadd <4 x float> [[STRIDED_VEC]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT:    [[TMP10]] = fadd <4 x float> [[STRIDED_VEC2]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP9]])
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP10]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD1]] = fadd float [[TMP14]], [[ADD_PHI2]]
-; CHECK-UNORDERED-NEXT:    [[OR:%.*]] = or i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD2]] = fadd float [[TMP15]], [[ADD_PHI1]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    store float [[ADD1_LCSSA]], float* [[A]], align 4
-; CHECK-UNORDERED-NEXT:    store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
-; CHECK-UNORDERED-NEXT:    ret void
-;
-; CHECK-ORDERED-LABEL: @fadd_strict_interleave(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-ORDERED-NEXT:    [[A1:%.*]] = load float, float* [[A]], align 4
-; CHECK-ORDERED-NEXT:    [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], -2
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP3]]
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
-; CHECK-ORDERED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-ORDERED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-ORDERED-NEXT:    [[TMP7]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI1]], <4 x float> [[STRIDED_VEC]])
-; CHECK-ORDERED-NEXT:    [[TMP8]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[STRIDED_VEC2]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD1]] = fadd float [[TMP10]], [[ADD_PHI2]]
-; CHECK-ORDERED-NEXT:    [[OR:%.*]] = or i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD2]] = fadd float [[TMP11]], [[ADD_PHI1]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    store float [[ADD1_LCSSA]], float* [[A]], align 4
-; CHECK-ORDERED-NEXT:    store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
-; CHECK-ORDERED-NEXT:    ret void
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_strict_interleave
+; CHECK-ORDERED: entry
+; CHECK-ORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK-ORDERED: %[[LOAD1:.*]] = load float, float* %a
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ]
+; CHECK-ORDERED: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ]
+; CHECK-ORDERED: %[[WIDE_LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-ORDERED: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-ORDERED: %[[RDX1]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI2]], <4 x float> %[[STRIDED1]])
+; CHECK-ORDERED: %[[RDX2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[STRIDED2]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: ret void
+
+; CHECK-UNORDERED-LABEL: @fadd_strict_interleave
+; CHECK-UNORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK-UNORDERED: %[[LOADA1:.*]] = load float, float* %a
+; CHECK-UNORDERED: %[[LOADA2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK-UNORDERED: vector.ph
+; CHECK-UNORDERED: %[[INS2:.*]] = insertelement <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, float %[[LOADA2]], i32 0
+; CHECK-UNORDERED: %[[INS1:.*]] = insertelement <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, float %[[LOADA1]], i32 0
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <4 x float> [ %[[INS2]], %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <4 x float> [ %[[INS1]], %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[WIDE_LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-UNORDERED: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <4 x float> %[[STRIDED1:.*]], %[[VEC_PHI1]]
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <4 x float> %[[STRIDED2:.*]], %[[VEC_PHI2]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD1]])
+; CHECK-UNORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], {{.*}}
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float %[[LOAD2]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[SUM1:.*]] = phi float [ %[[FADD1]], %for.body ], [ %[[RDX1]], %middle.block ]
+; CHECK-UNORDERED: %[[SUM2:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX2]], %middle.block ]
+; CHECK-UNORDERED: store float %[[SUM1]]
+; CHECK-UNORDERED: store float %[[SUM2]]
+; CHECK-UNORDERED: ret void
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   %arrayidxa = getelementptr inbounds float, float* %a, i64 1
@@ -787,148 +330,42 @@ for.end:
 }
 
 define float @fadd_of_sum(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body.preheader:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[RDX]] = fadd float [[RES_014]], [[ADD]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]]
-; CHECK-NOT-VECTORIZED:       for.end.loopexit:
-; CHECK-NOT-VECTORIZED-NEXT:    [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_END]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[RES]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_of_sum(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-UNORDERED:       for.body.preheader:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-UNORDERED-NEXT:    [[TMP9]] = fadd <4 x float> [[VEC_PHI]], [[TMP8]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP9]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT:    [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD:%.*]] = fadd float [[TMP12]], [[TMP13]]
-; CHECK-UNORDERED-NEXT:    [[RDX]] = fadd float [[RES_014]], [[ADD]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-UNORDERED:       for.end.loopexit:
-; CHECK-UNORDERED-NEXT:    [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_END]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[RES]]
-;
-; CHECK-ORDERED-LABEL: @fadd_of_sum(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01
-; CHECK-ORDERED-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-ORDERED:       for.body.preheader:
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-ORDERED-NEXT:    [[TMP9]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[TMP8]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT:    [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD:%.*]] = fadd float [[TMP11]], [[TMP12]]
-; CHECK-ORDERED-NEXT:    [[RDX]] = fadd float [[RES_014]], [[ADD]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-ORDERED:       for.end.loopexit:
-; CHECK-ORDERED-NEXT:    [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_END]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-ORDERED-NEXT:    ret float [[RES]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_of_sum
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load <4 x float>, <4 x float>*
+; CHECK-ORDERED: %[[ADD:.*]] = fadd <4 x float> %[[LOAD1]], %[[LOAD2]]
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[ADD]])
+; CHECK-ORDERED: for.end.loopexit
+; CHECK-ORDERED: %[[EXIT_PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
+; CHECK-ORDERED: ret float %[[PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_of_sum
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <4 x float>, <4 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <4 x float> %[[VEC_LOAD1]], %[[VEC_LOAD2]]
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <4 x float> %[[VEC_PHI]], %[[VEC_FADD1]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], %[[LOAD2]]
+; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[FADD1]]
+; CHECK-UNORDERED: for.end.loopexit
+; CHECK-UNORDERED: %[[EXIT:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT]], %for.end.loopexit ]
+; CHECK-UNORDERED: ret float %[[SUM]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   %arrayidx = getelementptr inbounds float, float* %a, i64 1
@@ -955,213 +392,63 @@ for.end:                                 ; preds = %for.body, %entry
 }
 
 define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[RES:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP0]], 0.000000e+00
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-NOT-VECTORIZED:       if.then:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_INC]]
-; CHECK-NOT-VECTORIZED:       for.inc:
-; CHECK-NOT-VECTORIZED-NEXT:    [[PHI:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[FADD]] = fadd float [[RES]], [[PHI]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[RDX]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_conditional(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 1.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = fcmp une <4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK-UNORDERED:       pred.load.if:
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 0
-; CHECK-UNORDERED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK-UNORDERED:       pred.load.continue:
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK-UNORDERED:       pred.load.if1:
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP13]], i32 1
-; CHECK-UNORDERED-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK-UNORDERED:       pred.load.continue2:
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = phi <4 x float> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK-UNORDERED:       pred.load.if3:
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 2
-; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
-; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP18]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP19]], i32 2
-; CHECK-UNORDERED-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK-UNORDERED:       pred.load.continue4:
-; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = phi <4 x float> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP20]], [[PRED_LOAD_IF3]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK-UNORDERED:       pred.load.if5:
-; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 3
-; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = load float, float* [[TMP24]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP25]], i32 3
-; CHECK-UNORDERED-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK-UNORDERED:       pred.load.continue6:
-; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = phi <4 x float> [ [[TMP21]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP26]], [[PRED_LOAD_IF5]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP28:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-UNORDERED-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP28]], <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, <4 x float> [[TMP27]]
-; CHECK-UNORDERED-NEXT:    [[TMP29]] = fadd <4 x float> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP31:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP29]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP32:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP32]], 0.000000e+00
-; CHECK-UNORDERED-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-UNORDERED:       if.then:
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP33:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    br label [[FOR_INC]]
-; CHECK-UNORDERED:       for.inc:
-; CHECK-UNORDERED-NEXT:    [[PHI:%.*]] = phi float [ [[TMP33]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[FADD]] = fadd float [[RES]], [[PHI]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[RDX]]
-;
-; CHECK-ORDERED-LABEL: @fadd_conditional(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = fcmp une <4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-ORDERED-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK-ORDERED:       pred.load.if:
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 0
-; CHECK-ORDERED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK-ORDERED:       pred.load.continue:
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; CHECK-ORDERED-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK-ORDERED:       pred.load.if1:
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 1
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP13]], i32 1
-; CHECK-ORDERED-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK-ORDERED:       pred.load.continue2:
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = phi <4 x float> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; CHECK-ORDERED-NEXT:    br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK-ORDERED:       pred.load.if3:
-; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 2
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
-; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP18]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP19]], i32 2
-; CHECK-ORDERED-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK-ORDERED:       pred.load.continue4:
-; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = phi <4 x float> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP20]], [[PRED_LOAD_IF3]] ]
-; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; CHECK-ORDERED-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK-ORDERED:       pred.load.if5:
-; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 3
-; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = load float, float* [[TMP24]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP25]], i32 3
-; CHECK-ORDERED-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK-ORDERED:       pred.load.continue6:
-; CHECK-ORDERED-NEXT:    [[TMP27:%.*]] = phi <4 x float> [ [[TMP21]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP26]], [[PRED_LOAD_IF5]] ]
-; CHECK-ORDERED-NEXT:    [[TMP28:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-ORDERED-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP28]], <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, <4 x float> [[TMP27]]
-; CHECK-ORDERED-NEXT:    [[TMP29]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[PREDPHI]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP31:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP31]], 0.000000e+00
-; CHECK-ORDERED-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-ORDERED:       if.then:
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP32:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    br label [[FOR_INC]]
-; CHECK-ORDERED:       for.inc:
-; CHECK-ORDERED-NEXT:    [[PHI:%.*]] = phi float [ [[TMP32]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[FADD]] = fadd float [[RES]], [[PHI]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[RDX]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_conditional
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue6 ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-ORDERED: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer
+; CHECK-ORDERED: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0
+; CHECK-ORDERED: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue
+; CHECK-ORDERED: pred.load.continue6
+; CHECK-ORDERED: %[[PHI1:.*]] = phi <4 x float> [ %[[PHI0:.*]], %pred.load.continue4 ], [ %[[INS_ELT:.*]], %pred.load.if5 ]
+; CHECK-ORDERED: %[[XOR:.*]] =  xor <4 x i1> %[[FCMP1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-ORDERED: %[[PRED:.*]] = select <4 x i1> %[[XOR]], <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, <4 x float> %[[PHI1]]
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[PHI]], <4 x float> %[[PRED]])
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-ORDERED: %[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00
+; CHECK-ORDERED: br i1 %[[FCMP2]], label %if.then, label %for.inc
+; CHECK-ORDERED: if.then
+; CHECK-ORDERED: %[[LOAD3:.*]] = load float, float*
+; CHECK-ORDERED: br label %for.inc
+; CHECK-ORDERED: for.inc
+; CHECK-ORDERED: %[[PHI2:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ]
+; CHECK-ORDERED: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI2]]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: ret float %[[RDX_PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_conditional
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[PHI:.*]] = phi <4 x float> [ <float 1.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD:.*]], %pred.load.continue6 ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-UNORDERED: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer
+; CHECK-UNORDERED: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0
+; CHECK-UNORDERED: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue
+; CHECK-UNORDERED: pred.load.continue6
+; CHECK-UNORDERED: %[[XOR:.*]] =  xor <4 x i1> %[[FCMP1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-UNORDERED: %[[PRED:.*]] = select <4 x i1> %[[XOR]], <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, <4 x float> %[[PRED_PHI:.*]]
+; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[PHI]], %[[PRED]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00
+; CHECK-UNORDERED: br i1 %[[FCMP2]], label %if.then, label %for.inc
+; CHECK-UNORDERED: if.then
+; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, float*
+; CHECK-UNORDERED: for.inc
+; CHECK-UNORDERED: %[[PHI:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ]
+; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RDX_PHI]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -1193,147 +480,44 @@ for.end:
 
 ; Test to check masking correct, using the "llvm.loop.vectorize.predicate.enable" attribute
 define float @fadd_predicated(float* noalias nocapture %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_predicated(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_02:%.*]] = phi float [ [[L7:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[L2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[L3:%.*]] = load float, float* [[L2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[L7]] = fadd float [[SUM_02]], [[L3]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_0_LCSSA:%.*]] = phi float [ [[L7]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[SUM_0_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_predicated(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 1
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; CHECK-UNORDERED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-UNORDERED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE2]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <2 x float> [ <float 0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[PRED_LOAD_CONTINUE2]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK-UNORDERED:       pred.load.if:
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0
-; CHECK-UNORDERED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK-UNORDERED:       pred.load.continue:
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_LOAD_IF]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP7]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
-; CHECK-UNORDERED:       pred.load.if1:
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]]
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP10]], i32 1
-; CHECK-UNORDERED-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK-UNORDERED:       pred.load.continue2:
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ [[TMP6]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], [[PRED_LOAD_IF1]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP13]] = fadd <2 x float> [[VEC_PHI]], [[TMP12]]
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = select <2 x i1> [[TMP0]], <2 x float> [[TMP13]], <2 x float> [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; CHECK-UNORDERED-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> [[TMP14]])
-; CHECK-UNORDERED-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_02:%.*]] = phi float [ [[L7:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT:    [[L2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[L3:%.*]] = load float, float* [[L2]], align 4
-; CHECK-UNORDERED-NEXT:    [[L7]] = fadd float [[SUM_02]], [[L3]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[SUM_0_LCSSA:%.*]] = phi float [ [[L7]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[SUM_0_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_predicated(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 1
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; CHECK-ORDERED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-ORDERED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE2]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[PRED_LOAD_CONTINUE2]] ]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
-; CHECK-ORDERED-NEXT:    br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK-ORDERED:       pred.load.if:
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP2]]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0
-; CHECK-ORDERED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK-ORDERED:       pred.load.continue:
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_LOAD_IF]] ]
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
-; CHECK-ORDERED-NEXT:    br i1 [[TMP7]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
-; CHECK-ORDERED:       pred.load.if1:
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 1
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP10]], i32 1
-; CHECK-ORDERED-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK-ORDERED:       pred.load.continue2:
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ [[TMP6]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], [[PRED_LOAD_IF1]] ]
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP0]], <2 x float> [[TMP12]], <2 x float> <float -0.000000e+00, float -0.000000e+00>
-; CHECK-ORDERED-NEXT:    [[TMP14]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI]], <2 x float> [[TMP13]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; CHECK-ORDERED-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_02:%.*]] = phi float [ [[L7:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT:    [[L2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[L3:%.*]] = load float, float* [[L2]], align 4
-; CHECK-ORDERED-NEXT:    [[L7]] = fadd float [[SUM_02]], [[L3]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[SUM_0_LCSSA:%.*]] = phi float [ [[L7]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[SUM_0_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_predicated
+; CHECK-ORDERED: vector.ph
+; CHECK-ORDERED: %[[TRIP_MINUS_ONE:.*]] = sub i64 %n, 1
+; CHECK-ORDERED: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i32 0
+; CHECK-ORDERED: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[RDX_PHI:.*]] =  phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue2 ]
+; CHECK-ORDERED: pred.load.continue2
+; CHECK-ORDERED: %[[PHI:.*]] = phi <2 x float> [ %[[PHI0:.*]], %pred.load.continue ], [ %[[INS_ELT:.*]], %pred.load.if1 ]
+; CHECK-ORDERED: %[[MASK:.*]] = select <2 x i1> %0, <2 x float> %[[PHI]], <2 x float> <float -0.000000e+00, float -0.000000e+00>
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v2f32(float %[[RDX_PHI]], <2 x float> %[[MASK]])
+; CHECK-ORDERED: for.end:
+; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: ret float %[[RES_PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_predicated
+; CHECK-UNORDERED: vector.ph
+; CHECK-UNORDERED: %[[TRIP_MINUS_ONE:.*]] = sub i64 %n, 1
+; CHECK-UNORDERED: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i32 0
+; CHECK-UNORDERED: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[RDX_PHI:.*]] =  phi <2 x float> [ <float 0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[FADD:.*]], %pred.load.continue2 ]
+; CHECK-UNORDERED: %[[ICMP:.*]] = icmp ule <2 x i64> %vec.ind, %[[SPLAT]]
+; CHECK-UNORDERED: pred.load.continue2
+; CHECK-UNORDERED: %[[FADD]] = fadd <2 x float> %[[RDX_PHI]], {{.*}}
+; CHECK-UNORDERED: %[[MASK:.*]] = select <2 x i1> %[[ICMP]], <2 x float> %[[FADD]], <2 x float> %[[RDX_PHI]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %[[MASK]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[LOAD]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[SUM]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_predicated
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -1355,95 +539,30 @@ for.end:                                            ; preds = %for.body
 
 ; Negative test - loop contains multiple fadds which we cannot safely reorder
 define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD3]] = fadd float [[ADD]], [[TMP1]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[RDX]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_multiple(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP8]] = fadd <8 x float> [[TMP4]], [[WIDE_LOAD1]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP8]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD:%.*]] = fadd float [[SUM]], [[TMP11]]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD3]] = fadd float [[ADD]], [[TMP12]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[RDX]]
-;
-; CHECK-ORDERED-LABEL: @fadd_multiple(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD3]] = fadd float [[ADD]], [[TMP1]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    ret float [[RDX]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_multiple
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @fadd_multiple
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[PHI:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>
+; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]]
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]]
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RET]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -1469,95 +588,30 @@ for.end:                                         ; preds = %for.body
 ; Negative test - loop contains two fadds and only one fadd has the fast flag,
 ; which we cannot safely reorder.
 define float @fadd_multiple_one_flag(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_one_flag(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD3]] = fadd fast float [[ADD]], [[TMP1]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[RDX]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_multiple_one_flag(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP8]] = fadd fast <8 x float> [[TMP4]], [[WIDE_LOAD1]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP8]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD:%.*]] = fadd float [[SUM]], [[TMP11]]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD3]] = fadd fast float [[ADD]], [[TMP12]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[RDX]]
-;
-; CHECK-ORDERED-LABEL: @fadd_multiple_one_flag(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD3]] = fadd fast float [[ADD]], [[TMP1]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    ret float [[RDX]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_multiple_one_flag
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @fadd_multiple_one_flag
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[PHI:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>
+; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]]
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd fast <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]]
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD2]] = fadd fast float %[[FADD1]], %[[LOAD2]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RET]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_one_flag
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -1599,68 +653,14 @@ for.end:                                         ; preds = %for.body
 ; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
 ; with the -hints-allow-reordering flag set to true.
 define float @induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) {
-; CHECK-NOT-VECTORIZED-LABEL: @induction_and_reduction(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[X_014:%.*]] = phi float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    store float [[X_014]], float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD]] = fadd float [[X_014]], 2.000000e+00
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD3]] = fadd float [[SUM_015]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[ADD3_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @induction_and_reduction(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[X_014:%.*]] = phi float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    store float [[X_014]], float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD]] = fadd float [[X_014]], 2.000000e+00
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD3]] = fadd float [[SUM_015]], [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[ADD3_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @induction_and_reduction(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[X_014:%.*]] = phi float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    store float [[X_014]], float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD]] = fadd float [[X_014]], 2.000000e+00
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD3]] = fadd float [[SUM_015]], [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    ret float [[ADD3_LCSSA]]
-;
+; CHECK-ORDERED-LABEL: @induction_and_reduction
+; CHECK-ORDERED-NOT: vector.body
 
+; CHECK-UNORDERED-LABEL: @induction_and_reduction
+; CHECK-UNORDERED-NOT: vector.body
 
+; CHECK-NOT-VECTORIZED-LABEL: @induction_and_reduction
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -1685,143 +685,50 @@ for.end:
 
 ; As above, but with the FP induction being unordered (fast) the loop can be vectorized with strict reductions
 define float @fast_induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) {
-; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_and_reduction(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[X_014:%.*]] = phi fast float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    store float [[X_014]], float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD]] = fadd fast float [[X_014]], 2.000000e+00
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD3]] = fadd float [[SUM_015]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[ADD3_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fast_induction_and_reduction(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = fmul fast float 2.000000e+00, [[CAST_CRD]]
-; CHECK-UNORDERED-NEXT:    [[IND_END:%.*]] = fadd fast float [[INIT:%.*]], [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i32 0
-; CHECK-UNORDERED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-; CHECK-UNORDERED-NEXT:    [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-UNORDERED-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP4]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP8]] = fadd <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT:    [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP8]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi float [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[INIT]], [[ENTRY]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_015:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[X_014:%.*]] = phi fast float [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    store float [[X_014]], float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD]] = fadd fast float [[X_014]], 2.000000e+00
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD3]] = fadd float [[SUM_015]], [[TMP11]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[ADD3_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fast_induction_and_reduction(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = fmul fast float 2.000000e+00, [[CAST_CRD]]
-; CHECK-ORDERED-NEXT:    [[IND_END:%.*]] = fadd fast float [[INIT:%.*]], [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i32 0
-; CHECK-ORDERED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-; CHECK-ORDERED-NEXT:    [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-ORDERED-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP4]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP8]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT:    [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi float [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[INIT]], [[ENTRY]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_015:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[X_014:%.*]] = phi fast float [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    store float [[X_014]], float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD]] = fadd fast float [[X_014]], 2.000000e+00
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD3]] = fadd float [[SUM_015]], [[TMP10]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[ADD3_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fast_induction_and_reduction
+; CHECK-ORDERED: vector.ph
+; CHECK-ORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ]
+; CHECK-ORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-ORDERED: %[[FADD1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[RDX_PHI]], <4 x float> %[[LOAD1]])
+; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
+; CHECK-ORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
+; CHECK-ORDERED: store float %[[IND_SUM_PHI]], float*
+; CHECK-ORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-ORDERED: %[[FADD2]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[FADD1]], %middle.block ]
+; CHECK-ORDERED: ret float %[[RES_PHI]]
+
+; CHECK-UNORDERED-LABEL: @fast_induction_and_reduction
+; CHECK-UNORDERED: vector.ph
+; CHECK-UNORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[RDX_PHI]], %[[LOAD1]]
+; CHECK-UNORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: %[[VEC_RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]])
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD:.*]], %for.body ]
+; CHECK-UNORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
+; CHECK-UNORDERED: store float %[[IND_SUM_PHI]], float*
+; CHECK-UNORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[VEC_RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES_PHI]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_and_reduction
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -1848,80 +755,15 @@ for.end:
 ; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
 ; with the -hints-allow-reordering flag set to true.
 define float @fast_induction_unordered_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, float* noalias nocapture %B, i64 %N) {
-; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_unordered_reduction(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM2_023:%.*]] = phi float [ 3.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_022:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[X_021:%.*]] = phi float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    store float [[X_021]], float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD]] = fadd fast float [[X_021]], 2.000000e+00
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD3]] = fadd float [[SUM_022]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[MUL]] = fmul float [[SUM2_023]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD6:%.*]] = fadd float [[ADD3_LCSSA]], [[MUL_LCSSA]]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[ADD6]]
-;
-; CHECK-UNORDERED-LABEL: @fast_induction_unordered_reduction(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM2_023:%.*]] = phi float [ 3.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_022:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[X_021:%.*]] = phi float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    store float [[X_021]], float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD]] = fadd fast float [[X_021]], 2.000000e+00
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD3]] = fadd float [[SUM_022]], [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[MUL]] = fmul float [[SUM2_023]], [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ADD6:%.*]] = fadd float [[ADD3_LCSSA]], [[MUL_LCSSA]]
-; CHECK-UNORDERED-NEXT:    ret float [[ADD6]]
-;
-; CHECK-ORDERED-LABEL: @fast_induction_unordered_reduction(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM2_023:%.*]] = phi float [ 3.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_022:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[X_021:%.*]] = phi float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    store float [[X_021]], float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD]] = fadd fast float [[X_021]], 2.000000e+00
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD3]] = fadd float [[SUM_022]], [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[MUL]] = fmul float [[SUM2_023]], [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ADD6:%.*]] = fadd float [[ADD3_LCSSA]], [[MUL_LCSSA]]
-; CHECK-ORDERED-NEXT:    ret float [[ADD6]]
-;
 
+; CHECK-ORDERED-LABEL: @fast_induction_unordered_reduction
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @fast_induction_unordered_reduction
+; CHECK-UNORDERED-NOT: vector.body
 
+; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_unordered_reduction
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -1949,130 +791,59 @@ for.end:
 
 ; Test reductions for a VF of 1 and a UF > 1.
 define float @fadd_scalar_vf(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD]] = fadd float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_scalar_vf(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1
-; CHECK-UNORDERED-NEXT:    [[INDUCTION5:%.*]] = add i64 [[INDEX]], 2
-; CHECK-UNORDERED-NEXT:    [[INDUCTION6:%.*]] = add i64 [[INDEX]], 3
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDUCTION]]
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION4]]
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION5]]
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION6]]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = load float, float* [[TMP0]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP1]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = load float, float* [[TMP2]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP8]] = fadd float [[TMP4]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[TMP9]] = fadd float [[TMP5]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT:    [[TMP10]] = fadd float [[TMP6]], [[VEC_PHI2]]
-; CHECK-UNORDERED-NEXT:    [[TMP11]] = fadd float [[TMP7]], [[VEC_PHI3]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd float [[TMP9]], [[TMP8]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX7:%.*]] = fadd float [[TMP10]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX8:%.*]] = fadd float [[TMP11]], [[BIN_RDX7]]
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD]] = fadd float [[TMP13]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_scalar_vf(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-ORDERED-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-ORDERED-NEXT:    [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDUCTION]]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION1]]
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION2]]
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION3]]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = load float, float* [[TMP0]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP1]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = load float, float* [[TMP2]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = fadd float [[VEC_PHI]], [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = fadd float [[TMP8]], [[TMP5]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = fadd float [[TMP9]], [[TMP6]]
-; CHECK-ORDERED-NEXT:    [[TMP11]] = fadd float [[TMP10]], [[TMP7]]
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD]] = fadd float [[TMP13]], [[SUM_07]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_scalar_vf
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, {{.*}} ], [ %[[FADD4:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-ORDERED: %[[LOAD3:.*]] = load float, float*
+; CHECK-ORDERED: %[[LOAD4:.*]] = load float, float*
+; CHECK-ORDERED: %[[FADD1:.*]] = fadd float %[[VEC_PHI]], %[[LOAD1]]
+; CHECK-ORDERED: %[[FADD2:.*]] = fadd float %[[FADD1]], %[[LOAD2]]
+; CHECK-ORDERED: %[[FADD3:.*]] = fadd float %[[FADD2]], %[[LOAD3]]
+; CHECK-ORDERED: %[[FADD4]] = fadd float %[[FADD3]], %[[LOAD4]]
+; CHECK-ORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-ORDERED: scalar.ph
+; CHECK-ORDERED: %[[MERGE_RDX:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[FADD4]], %middle.block ]
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ]
+; CHECK-ORDERED: %[[LOAD5:.*]] = load float, float*
+; CHECK-ORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[FADD4]], %middle.block ]
+; CHECK-ORDERED: ret float %[[RES_PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_scalar_vf
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD1:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD3:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD4:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, float*
+; CHECK-UNORDERED: %[[LOAD4:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1]] = fadd float %[[LOAD1]], %[[VEC_PHI1]]
+; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[LOAD2]], %[[VEC_PHI2]]
+; CHECK-UNORDERED: %[[FADD3]] = fadd float %[[LOAD3]], %[[VEC_PHI3]]
+; CHECK-UNORDERED: %[[FADD4]] = fadd float %[[LOAD4]], %[[VEC_PHI4]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd float %[[FADD2]], %[[FADD1]]
+; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd float %[[FADD3]], %[[BIN_RDX1]]
+; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd float %[[FADD4]], %[[BIN_RDX2]]
+; CHECK-UNORDERED: scalar.ph
+; CHECK-UNORDERED: %[[MERGE_RDX:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[BIN_RDX3]], %middle.block ]
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ]
+; CHECK-UNORDERED: %[[LOAD5:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[BIN_RDX3]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES_PHI]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -2093,131 +864,59 @@ for.end:
 
 ; Same as above but where fadd has a fast-math flag.
 define float @fadd_scalar_vf_fmf(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf_fmf(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD]] = fadd nnan float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_scalar_vf_fmf(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1
-; CHECK-UNORDERED-NEXT:    [[INDUCTION5:%.*]] = add i64 [[INDEX]], 2
-; CHECK-UNORDERED-NEXT:    [[INDUCTION6:%.*]] = add i64 [[INDEX]], 3
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDUCTION]]
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION4]]
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION5]]
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION6]]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = load float, float* [[TMP0]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP1]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = load float, float* [[TMP2]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP8]] = fadd nnan float [[TMP4]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[TMP9]] = fadd nnan float [[TMP5]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT:    [[TMP10]] = fadd nnan float [[TMP6]], [[VEC_PHI2]]
-; CHECK-UNORDERED-NEXT:    [[TMP11]] = fadd nnan float [[TMP7]], [[VEC_PHI3]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd nnan float [[TMP9]], [[TMP8]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX7:%.*]] = fadd nnan float [[TMP10]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX8:%.*]] = fadd nnan float [[TMP11]], [[BIN_RDX7]]
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD]] = fadd nnan float [[TMP13]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_scalar_vf_fmf(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-ORDERED-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-ORDERED-NEXT:    [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDUCTION]]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION1]]
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION2]]
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION3]]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = load float, float* [[TMP0]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP1]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = load float, float* [[TMP2]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = fadd nnan float [[VEC_PHI]], [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = fadd nnan float [[TMP8]], [[TMP5]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = fadd nnan float [[TMP9]], [[TMP6]]
-; CHECK-ORDERED-NEXT:    [[TMP11]] = fadd nnan float [[TMP10]], [[TMP7]]
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD]] = fadd nnan float [[TMP13]], [[SUM_07]]
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[ADD_LCSSA]]
-;
-
+; CHECK-ORDERED-LABEL: @fadd_scalar_vf_fmf
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD4:%.*]], %vector.body ]
+; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD3:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD4:%.*]] = load float, float*
+; CHECK-ORDERED: [[FADD1:%.*]] = fadd nnan float [[VEC_PHI]], [[LOAD1]]
+; CHECK-ORDERED: [[FADD2:%.*]] = fadd nnan float [[FADD1]], [[LOAD2]]
+; CHECK-ORDERED: [[FADD3:%.*]] = fadd nnan float [[FADD2]], [[LOAD3]]
+; CHECK-ORDERED: [[FADD4]] = fadd nnan float [[FADD3]], [[LOAD4]]
+; CHECK-ORDERED-NOT: @llvm.vector.reduce.fadd
+; CHECK-ORDERED: scalar.ph:
+; CHECK-ORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[FADD4]], %middle.block ]
+; CHECK-ORDERED: for.body:
+; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[FADD5:%.*]], %for.body ]
+; CHECK-ORDERED: [[LOAD5:%.*]] = load float, float*
+; CHECK-ORDERED: [[FADD5]] = fadd nnan float [[LOAD5]], [[SUM_07]]
+; CHECK-ORDERED: for.end:
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[FADD5]], %for.body ], [ [[FADD4]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fadd_scalar_vf_fmf
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI4:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD4:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD3:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD4:%.*]] = load float, float*
+; CHECK-UNORDERED: [[FADD1]] = fadd nnan float [[LOAD1]], [[VEC_PHI1]]
+; CHECK-UNORDERED: [[FADD2]] = fadd nnan float [[LOAD2]], [[VEC_PHI2]]
+; CHECK-UNORDERED: [[FADD3]] = fadd nnan float [[LOAD3]], [[VEC_PHI3]]
+; CHECK-UNORDERED: [[FADD4]] = fadd nnan float [[LOAD4]], [[VEC_PHI4]]
+; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan float [[FADD2]], [[FADD1]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan float [[FADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd nnan float [[FADD4]], [[BIN_RDX2]]
+; CHECK-UNORDERED: scalar.ph:
+; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[BIN_RDX3]], %middle.block ]
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[FADD5:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, float*
+; CHECK-UNORDERED: [[FADD5]] = fadd nnan float [[LOAD5]], [[SUM_07]]
 ; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[FADD5]], %for.body ], [ [[BIN_RDX3]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
 
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf_fmf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -2238,102 +937,30 @@ for.end:
 
 ; Test case where the reduction step is a first-order recurrence.
 define double @reduction_increment_by_first_order_recurrence() {
-; CHECK-NOT-VECTORIZED-LABEL: @reduction_increment_by_first_order_recurrence(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[LOOP:%.*]]
-; CHECK-NOT-VECTORIZED:       loop:
-; CHECK-NOT-VECTORIZED-NEXT:    [[RED:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[FOR:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[RED_NEXT]] = fadd double [[FOR]], [[RED]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[FOR_NEXT]] = sitofp i32 [[IV]] to double
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-NOT-VECTORIZED:       exit:
-; CHECK-NOT-VECTORIZED-NEXT:    [[RES:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret double [[RES]]
+; CHECK-ORDERED-LABEL: @reduction_increment_by_first_order_recurrence(
+; CHECK-ORDERED:  vector.body:
+; CHECK-ORDERED:    [[RED:%.*]] = phi double [ 0.000000e+00, %vector.ph ], [ [[RED_NEXT:%.*]], %vector.body ]
+; CHECK-ORDERED:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %vector.ph ], [ [[FOR_NEXT:%.*]], %vector.body ]
+; CHECK-ORDERED:    [[FOR_NEXT]] = sitofp <4 x i32> %vec.ind to <4 x double>
+; CHECK-ORDERED:    [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[FOR_NEXT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-ORDERED:    [[RED_NEXT]] = call double @llvm.vector.reduce.fadd.v4f64(double [[RED]], <4 x double> [[TMP1]])
+; CHECK-ORDERED:  scalar.ph:
+; CHECK-ORDERED:    = phi double [ 0.000000e+00, %entry ], [ [[RED_NEXT]], %middle.block ]
 ;
 ; CHECK-UNORDERED-LABEL: @reduction_increment_by_first_order_recurrence(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP0]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-UNORDERED-NEXT:    [[TMP2]] = fadd <4 x double> [[TMP1]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP2]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i32 0, 0
-; CHECK-UNORDERED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP0]], i32 3
-; CHECK-UNORDERED-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP0]], i32 2
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[LOOP:%.*]]
-; CHECK-UNORDERED:       loop:
-; CHECK-UNORDERED-NEXT:    [[RED:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-UNORDERED-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-UNORDERED-NEXT:    [[RED_NEXT]] = fadd double [[SCALAR_RECUR]], [[RED]]
-; CHECK-UNORDERED-NEXT:    [[FOR_NEXT]] = sitofp i32 [[IV]] to double
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0
-; CHECK-UNORDERED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
-; CHECK-UNORDERED:       exit:
-; CHECK-UNORDERED-NEXT:    [[RES:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret double [[RES]]
+; CHECK-UNORDERED:  vector.body:
+; CHECK-UNORDERED:    [[RED:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %vector.ph ], [ [[RED_NEXT:%.*]], %vector.body ]
+; CHECK-UNORDERED:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %vector.ph ], [ [[FOR_NEXT:%.*]], %vector.body ]
+; CHECK-UNORDERED:    [[FOR_NEXT]] = sitofp <4 x i32> %vec.ind to <4 x double>
+; CHECK-UNORDERED:    [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[FOR_NEXT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-UNORDERED:    [[RED_NEXT]] = fadd <4 x double> [[TMP1]], [[RED]]
+; CHECK-UNORDERED:  middle.block:
+; CHECK-UNORDERED:    [[RDX:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[RED_NEXT]])
+; CHECK-UNORDERED:  scalar.ph:
+; CHECK-UNORDERED:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, %entry ], [ [[RDX]], %middle.block ]
 ;
-; CHECK-ORDERED-LABEL: @reduction_increment_by_first_order_recurrence(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP0]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-ORDERED-NEXT:    [[TMP2]] = call double @llvm.vector.reduce.fadd.v4f64(double [[VEC_PHI]], <4 x double> [[TMP1]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-ORDERED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0
-; CHECK-ORDERED-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i32 0, 0
-; CHECK-ORDERED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP0]], i32 3
-; CHECK-ORDERED-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP0]], i32 2
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[LOOP:%.*]]
-; CHECK-ORDERED:       loop:
-; CHECK-ORDERED-NEXT:    [[RED:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-ORDERED-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-ORDERED-NEXT:    [[RED_NEXT]] = fadd double [[SCALAR_RECUR]], [[RED]]
-; CHECK-ORDERED-NEXT:    [[FOR_NEXT]] = sitofp i32 [[IV]] to double
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0
-; CHECK-ORDERED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
-; CHECK-ORDERED:       exit:
-; CHECK-ORDERED-NEXT:    [[RES:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret double [[RES]]
+; CHECK-NOT-VECTORIZED-LABEL: @reduction_increment_by_first_order_recurrence(
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 ;
 entry:
   br label %loop
@@ -2356,73 +983,14 @@ exit:
 ; We should not mark the fadd as an ordered reduction here as there are
 ; more than 2 uses of the instruction
 define float @fadd_multiple_use(i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_use(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT2:%.*]], [[BB2:%.*]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[BB2]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BB2]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[FADD]] = fadd float [[RED]], 1.000000e+00
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nsw i64 [[PHI1]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]]
-; CHECK-NOT-VECTORIZED:       bb1:
-; CHECK-NOT-VECTORIZED-NEXT:    [[PHI2:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[PHI2]]
-; CHECK-NOT-VECTORIZED:       bb2:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT2]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[PHI3:%.*]] = phi float [ [[FADD]], [[BB2]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[PHI3]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_multiple_use(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT2:%.*]], [[BB2:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[BB2]] ]
-; CHECK-UNORDERED-NEXT:    [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BB2]] ]
-; CHECK-UNORDERED-NEXT:    [[FADD]] = fadd float [[RED]], 1.000000e+00
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nsw i64 [[PHI1]], 1
-; CHECK-UNORDERED-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]]
-; CHECK-UNORDERED:       bb1:
-; CHECK-UNORDERED-NEXT:    [[PHI2:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[PHI2]]
-; CHECK-UNORDERED:       bb2:
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT2]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[PHI3:%.*]] = phi float [ [[FADD]], [[BB2]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[PHI3]]
-;
-; CHECK-ORDERED-LABEL: @fadd_multiple_use(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT2:%.*]], [[BB2:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[BB2]] ]
-; CHECK-ORDERED-NEXT:    [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BB2]] ]
-; CHECK-ORDERED-NEXT:    [[FADD]] = fadd float [[RED]], 1.000000e+00
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nsw i64 [[PHI1]], 1
-; CHECK-ORDERED-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]]
-; CHECK-ORDERED:       bb1:
-; CHECK-ORDERED-NEXT:    [[PHI2:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    ret float [[PHI2]]
-; CHECK-ORDERED:       bb2:
-; CHECK-ORDERED-NEXT:    [[IV_NEXT2]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[PHI3:%.*]] = phi float [ [[FADD]], [[BB2]] ]
-; CHECK-ORDERED-NEXT:    ret float [[PHI3]]
-;
+; CHECK-ORDERED-LABEL: @fadd_multiple_use
 ; CHECK-ORDERED-NOT: vector.body
 
+; CHECK-UNORDERED-LABEL: @fadd_multiple_use
 ; CHECK-UNORDERED-NOT: vector.body
 
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_use
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -2451,189 +1019,59 @@ for.end:
 
 ; Test case where the loop has a call to the llvm.fmuladd intrinsic.
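For anyone skimming the shorter hand-written checks being restored below: the [[RDX3:%.*]]/[[RDX3]] pairs bind an SSA name on first use and then require the same name again, which is how the loop-carried reduction chain is still verified without the full autogenerated output. The two strategies being checked can be sketched roughly as follows (illustrative only — names invented, VF of 8 taken from the checks): the strict/ordered form splits the fmuladd into a wide fmul and folds each vector of products into the scalar running sum with llvm.vector.reduce.fadd seeded by that sum, preserving the scalar addition order, while the fast/unordered form keeps wide fmuladd accumulators in the loop and reduces only once afterwards, starting from the -0.0 identity.

declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>)

; Ordered shape: the scalar sum is threaded through the reduce call, so the
; floating-point additions happen in the same order as in the scalar loop.
define float @sketch_ordered_step(float %sum, <8 x float> %a, <8 x float> %b) {
  %mul = fmul <8 x float> %a, %b
  %sum.next = call float @llvm.vector.reduce.fadd.v8f32(float %sum, <8 x float> %mul)
  ret float %sum.next
}

; Unordered shape (collapsed into one function for brevity): a wide accumulator
; is updated in the loop; the single final reduction is seeded with -0.0.
define float @sketch_unordered_final(<8 x float> %acc, <8 x float> %a, <8 x float> %b) {
  %acc.next = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %acc)
  %sum = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %acc.next)
  ret float %sum
}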
 define float @fmuladd_strict(float* %a, float* %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_strict(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 32
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]]
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP21]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 8
-; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x float>, <8 x float>* [[TMP23]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 16
-; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD9:%.*]] = load <8 x float>, <8 x float>* [[TMP25]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 24
-; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD10:%.*]] = load <8 x float>, <8 x float>* [[TMP27]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP28]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD7]], <8 x float> [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT:    [[TMP29]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD4]], <8 x float> [[WIDE_LOAD8]], <8 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT:    [[TMP30]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD5]], <8 x float> [[WIDE_LOAD9]], <8 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT:    [[TMP31]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD6]], <8 x float> [[WIDE_LOAD10]], <8 x float> [[VEC_PHI3]])
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-UNORDERED-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP29]], [[TMP28]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX11:%.*]] = fadd <8 x float> [[TMP30]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX12:%.*]] = fadd <8 x float> [[TMP31]], [[BIN_RDX11]]
-; CHECK-UNORDERED-NEXT:    [[TMP33:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX12]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP34:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP35:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP34]], float [[TMP35]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_strict(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 32
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
-; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]]
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP21]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 8
-; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP23]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 16
-; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP25]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 24
-; CHECK-ORDERED-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>*
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP27]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP28:%.*]] = fmul <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED-NEXT:    [[TMP29:%.*]] = fmul <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED-NEXT:    [[TMP30:%.*]] = fmul <8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED-NEXT:    [[TMP31:%.*]] = fmul <8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED-NEXT:    [[TMP32:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[TMP28]])
-; CHECK-ORDERED-NEXT:    [[TMP33:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP32]], <8 x float> [[TMP29]])
-; CHECK-ORDERED-NEXT:    [[TMP34:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP33]], <8 x float> [[TMP30]])
-; CHECK-ORDERED-NEXT:    [[TMP35]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP34]], <8 x float> [[TMP31]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-ORDERED-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP37:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP38:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP37]], float [[TMP38]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fmuladd_strict
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ]
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[FMUL]])
+; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX]], <8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX1]], <8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[RDX3]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX2]], <8 x float> [[FMUL3]])
+; CHECK-ORDERED: for.body:
+; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-ORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX3]], %middle.block ]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_strict
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]])
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [  [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -2656,156 +1094,73 @@ for.end:
 
 ; Test reductions for a VF of 1 and a UF > 1 where the loop has a call to the llvm.fmuladd intrinsic.
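A rough sketch of the same contrast with a scalar VF (illustrative only — invented names, two of the four unrolled lanes shown): the ordered run chains each product into the running sum in program order, while the unordered run keeps independent scalar accumulators and combines them only in middle.block.

declare float @llvm.fmuladd.f32(float, float, float)

; Ordered, VF=1: products are added into the running sum strictly in order,
; so nothing is reassociated.
define float @sketch_ordered_scalar_step(float %sum, float %a0, float %b0, float %a1, float %b1) {
  %m0 = fmul float %a0, %b0
  %s0 = fadd float %sum, %m0
  %m1 = fmul float %a1, %b1
  %s1 = fadd float %s0, %m1
  ret float %s1
}

; Unordered, VF=1: independent accumulators, only added together after the loop.
define float @sketch_unordered_scalar_step(float %acc0, float %acc1, float %a0, float %b0, float %a1, float %b1) {
  %p0 = call float @llvm.fmuladd.f32(float %a0, float %b0, float %acc0)
  %p1 = call float @llvm.fmuladd.f32(float %a1, float %b1, float %acc1)
  %partial = fadd float %p1, %p0
  ret float %partial
}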
 define float @fmuladd_scalar_vf(float* %a, float* %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_scalar_vf(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_scalar_vf(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1
-; CHECK-UNORDERED-NEXT:    [[INDUCTION5:%.*]] = add i64 [[INDEX]], 2
-; CHECK-UNORDERED-NEXT:    [[INDUCTION6:%.*]] = add i64 [[INDEX]], 3
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDUCTION]]
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION4]]
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION5]]
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION6]]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = load float, float* [[TMP0]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP1]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = load float, float* [[TMP2]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDUCTION]]
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION4]]
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION5]]
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION6]]
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = load float, float* [[TMP8]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = load float, float* [[TMP10]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP16]] = call float @llvm.fmuladd.f32(float [[TMP4]], float [[TMP12]], float [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT:    [[TMP17]] = call float @llvm.fmuladd.f32(float [[TMP5]], float [[TMP13]], float [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT:    [[TMP18]] = call float @llvm.fmuladd.f32(float [[TMP6]], float [[TMP14]], float [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT:    [[TMP19]] = call float @llvm.fmuladd.f32(float [[TMP7]], float [[TMP15]], float [[VEC_PHI3]])
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd float [[TMP17]], [[TMP16]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX7:%.*]] = fadd float [[TMP18]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX8:%.*]] = fadd float [[TMP19]], [[BIN_RDX7]]
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP21]], float [[TMP22]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_scalar_vf(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED:       vector.ph:
-; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED:       vector.body:
-; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-ORDERED-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-ORDERED-NEXT:    [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDUCTION]]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION1]]
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION2]]
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION3]]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = load float, float* [[TMP0]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP1]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = load float, float* [[TMP2]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDUCTION]]
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION1]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION2]]
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION3]]
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = load float, float* [[TMP8]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP9]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = load float, float* [[TMP10]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP11]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = fmul float [[TMP4]], [[TMP12]]
-; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = fmul float [[TMP5]], [[TMP13]]
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = fmul float [[TMP6]], [[TMP14]]
-; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = fmul float [[TMP7]], [[TMP15]]
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = fadd float [[VEC_PHI]], [[TMP16]]
-; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = fadd float [[TMP20]], [[TMP17]]
-; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = fadd float [[TMP21]], [[TMP18]]
-; CHECK-ORDERED-NEXT:    [[TMP23]] = fadd float [[TMP22]], [[TMP19]]
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
-; CHECK-ORDERED:       middle.block:
-; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP25]], float [[TMP26]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fmuladd_scalar_vf
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD3:%.*]], %vector.body ]
+; CHECK-ORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD3:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD4:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD5:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD6:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD7:%.*]] = load float, float*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul float [[LOAD]], [[LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul float [[LOAD1]], [[LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul float [[LOAD2]], [[LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul float [[LOAD3]], [[LOAD7]]
+; CHECK-ORDERED: [[FADD:%.*]] = fadd float [[VEC_PHI]], [[FMUL]]
+; CHECK-ORDERED: [[FADD1:%.*]] = fadd float [[FADD]], [[FMUL1]]
+; CHECK-ORDERED: [[FADD2:%.*]] = fadd float [[FADD1]], [[FMUL2]]
+; CHECK-ORDERED: [[FADD3]] = fadd float [[FADD2]], [[FMUL3]]
+; CHECK-ORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-ORDERED: scalar.ph
+; CHECK-ORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[FADD3]], %middle.block ]
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-ORDERED: [[LOAD8:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD9:%.*]] = load float, float*
+; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[FADD3]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_scalar_vf
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD3:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD4:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD6:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD7:%.*]] = load float, float*
+; CHECK-UNORDERED: [[FMULADD]] = call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD4]], float [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call float @llvm.fmuladd.f32(float [[LOAD1]], float [[LOAD5]], float [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call float @llvm.fmuladd.f32(float [[LOAD2]], float [[LOAD6]], float [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call float @llvm.fmuladd.f32(float [[LOAD3]], float [[LOAD7]], float [[VEC_PHI3]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd float [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd float [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd float [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: scalar.ph:
+; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[BIN_RDX2]], %middle.block ]
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD8:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD9:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[BIN_RDX2]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_scalar_vf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -2828,62 +1183,14 @@ for.end:
 
 ; Test case where the reduction phi is one of the mul operands of the fmuladd.
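This test and the following one (where the reduction phi appears as two of the operands) exercise the same restriction; roughly, an fmuladd only forms an fadd reduction when the reduction PHI feeds its addend operand. A minimal sketch of the disqualifying recurrence (invented names, not part of the test):

declare float @llvm.fmuladd.f32(float, float, float)

; sum' = sum * a + b is not a plain running sum, so it is not recognised as
; an fadd reduction and the loop stays scalar under every RUN line.
define float @sketch_phi_as_mul_operand(float %sum, float %a, float %b) {
  %muladd = tail call float @llvm.fmuladd.f32(float %sum, float %a, float %b)
  ret float %muladd
}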
 define float @fmuladd_phi_is_mul_operand(float* %a, float* %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_mul_operand(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[TMP1]])
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_mul_operand(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[TMP1]])
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_phi_is_mul_operand(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[TMP1]])
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    ret float [[MULADD_LCSSA]]
-;
+; CHECK-ORDERED-LABEL: @fmuladd_phi_is_mul_operand
+; CHECK-ORDERED-NOT: vector.body
 
+; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_mul_operand
+; CHECK-UNORDERED-NOT: vector.body
 
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_mul_operand
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -2906,56 +1213,14 @@ for.end:
 
 ; Test case where the reduction phi is used as two of the operands of the fmuladd.
 define float @fmuladd_phi_is_two_operands(float* %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_two_operands(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_two_operands(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[MULADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_phi_is_two_operands(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    ret float [[MULADD_LCSSA]]
-;
+; CHECK-ORDERED-LABEL: @fmuladd_phi_is_two_operands
+; CHECK-ORDERED-NOT: vector.body
 
+; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_two_operands
+; CHECK-UNORDERED-NOT: vector.body
 
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_two_operands
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -2977,134 +1242,37 @@ for.end:
 ; Test case with multiple calls to llvm.fmuladd, which is not safe to reorder
 ; so is only vectorized in the unordered (fast) case.
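A rough sketch of the recurrence involved (invented names, not part of the test): each iteration chains two fmuladds through the running sum, and as the comment above notes the pair is not safe to reorder, so only the unordered (fast) configuration vectorizes it.

declare float @llvm.fmuladd.f32(float, float, float)

; Two fmuladds chained through the running sum -- the shape the comment above
; describes as unsafe to reorder for a strict (in-order) reduction.
define float @sketch_chained_fmuladd(float %sum, float %a, float %b) {
  %muladd  = tail call float @llvm.fmuladd.f32(float %a, float %b, float %sum)
  %muladd2 = tail call float @llvm.fmuladd.f32(float %a, float %b, float %muladd)
  ret float %muladd2
}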
 define float @fmuladd_multiple(float* %a, float* %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_multiple(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[MULADD]])
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[MULADD2_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_multiple(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 32
-; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED:       vector.ph:
-; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED:       vector.body:
-; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP32:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]]
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP21]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 8
-; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x float>, <8 x float>* [[TMP23]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 16
-; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD9:%.*]] = load <8 x float>, <8 x float>* [[TMP25]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 24
-; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>*
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD10:%.*]] = load <8 x float>, <8 x float>* [[TMP27]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP28:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD7]], <8 x float> [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT:    [[TMP29:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD4]], <8 x float> [[WIDE_LOAD8]], <8 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT:    [[TMP30:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD5]], <8 x float> [[WIDE_LOAD9]], <8 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT:    [[TMP31:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD6]], <8 x float> [[WIDE_LOAD10]], <8 x float> [[VEC_PHI3]])
-; CHECK-UNORDERED-NEXT:    [[TMP32]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD7]], <8 x float> [[TMP28]])
-; CHECK-UNORDERED-NEXT:    [[TMP33]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD4]], <8 x float> [[WIDE_LOAD8]], <8 x float> [[TMP29]])
-; CHECK-UNORDERED-NEXT:    [[TMP34]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD5]], <8 x float> [[WIDE_LOAD9]], <8 x float> [[TMP30]])
-; CHECK-UNORDERED-NEXT:    [[TMP35]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD6]], <8 x float> [[WIDE_LOAD10]], <8 x float> [[TMP31]])
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-UNORDERED-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
-; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP33]], [[TMP32]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX11:%.*]] = fadd <8 x float> [[TMP34]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX12:%.*]] = fadd <8 x float> [[TMP35]], [[BIN_RDX11]]
-; CHECK-UNORDERED-NEXT:    [[TMP37:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX12]])
-; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP38:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP39:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP38]], float [[TMP39]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT:    [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP38]], float [[TMP39]], float [[MULADD]])
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[MULADD2_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_multiple(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT:    [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[MULADD]])
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    ret float [[MULADD2_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fmuladd_multiple
+; CHECK-ORDERED-NOT: vector.body:
+
+; CHECK-UNORDERED-LABEL: @fmuladd_multiple
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[FMULADD:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD2]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[FMULADD]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]])
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD2:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]])
+; CHECK-UNORDERED: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[MULADD]])
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD2]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_multiple
+; CHECK-NOT-VECTORIZED-NOT: vector.body:
 
 entry:
   br label %for.body
@@ -3128,65 +1296,14 @@ for.end:
 
 ; Same as above but the first fmuladd is one of the mul operands of the second fmuladd.
 define float @multiple_fmuladds_mul_operand(float* %a, float* %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_mul_operand(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[TMP1]])
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[MULADD2_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @multiple_fmuladds_mul_operand(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT:    [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[TMP1]])
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[MULADD2_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @multiple_fmuladds_mul_operand(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT:    [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[TMP1]])
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    ret float [[MULADD2_LCSSA]]
-;
+; CHECK-ORDERED-LABEL: @multiple_fmuladds_mul_operand
+; CHECK-ORDERED-NOT: vector.body
 
+; CHECK-UNORDERED-LABEL: @multiple_fmuladds_mul_operand
+; CHECK-UNORDERED-NOT: vector.body
 
+; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_mul_operand
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -3210,65 +1327,14 @@ for.end:
 
 ; Same as above but the first fmuladd is two of the operands of the second fmuladd.
 define float @multiple_fmuladds_two_operands(float* %a, float* %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_two_operands(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[MULADD]])
-; CHECK-NOT-VECTORIZED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED:       for.end:
-; CHECK-NOT-VECTORIZED-NEXT:    [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    ret float [[MULADD2_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @multiple_fmuladds_two_operands(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT:    [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT:    [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[MULADD]])
-; CHECK-UNORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    ret float [[MULADD2_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @multiple_fmuladds_two_operands(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT:    [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT:    [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[MULADD]])
-; CHECK-ORDERED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    ret float [[MULADD2_LCSSA]]
-;
+; CHECK-ORDERED-LABEL: @multiple_fmuladds_two_operands
+; CHECK-ORDERED-NOT: vector.body
 
+; CHECK-UNORDERED-LABEL: @multiple_fmuladds_two_operands
+; CHECK-UNORDERED-NOT: vector.body
 
+; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_two_operands
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   br label %for.body
@@ -3294,62 +1360,14 @@ declare float @llvm.fmuladd.f32(float, float, float)
 
 ; Test case with invariant store where fadd is strict.
 define void @reduction_store_to_invariant_address(float* %dst, float* readonly %src) {
-; CHECK-NOT-VECTORIZED-LABEL: @reduction_store_to_invariant_address(
-; CHECK-NOT-VECTORIZED-NEXT:  entry:
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 42
-; CHECK-NOT-VECTORIZED-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED:       for.body:
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NOT-VECTORIZED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[ADD]] = fadd float [[TMP0]], [[TMP1]]
-; CHECK-NOT-VECTORIZED-NEXT:    store float [[ADD]], float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NOT-VECTORIZED-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-; CHECK-NOT-VECTORIZED:       for.cond.cleanup:
-; CHECK-NOT-VECTORIZED-NEXT:    ret void
-;
-; CHECK-UNORDERED-LABEL: @reduction_store_to_invariant_address(
-; CHECK-UNORDERED-NEXT:  entry:
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 42
-; CHECK-UNORDERED-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED:       for.body:
-; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4
-; CHECK-UNORDERED-NEXT:    [[ADD]] = fadd float [[TMP0]], [[TMP1]]
-; CHECK-UNORDERED-NEXT:    store float [[ADD]], float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-UNORDERED-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-; CHECK-UNORDERED:       for.cond.cleanup:
-; CHECK-UNORDERED-NEXT:    ret void
-;
 ; CHECK-ORDERED-LABEL: @reduction_store_to_invariant_address(
-; CHECK-ORDERED-NEXT:  entry:
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 42
-; CHECK-ORDERED-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED:       for.body:
-; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4
-; CHECK-ORDERED-NEXT:    [[ADD]] = fadd float [[TMP0]], [[TMP1]]
-; CHECK-ORDERED-NEXT:    store float [[ADD]], float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-ORDERED-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-; CHECK-ORDERED:       for.cond.cleanup:
-; CHECK-ORDERED-NEXT:    ret void
-;
+; CHECK-ORDERED-NOT: vector.body
 
+; CHECK-UNORDERED-LABEL: @reduction_store_to_invariant_address(
+; CHECK-UNORDERED-NOT: vector.body
 
+; CHECK-NOT-VECTORIZED-LABEL: @reduction_store_to_invariant_address(
+; CHECK-NOT-VECTORIZED-NOT: vector.body
 
 entry:
   %arrayidx = getelementptr inbounds float, float* %dst, i64 42

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll
index c114b014c6989..1021791a0dfb1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve < %s -S | FileCheck %s
 
 
@@ -8,52 +7,11 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @cmpsel_i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @cmpsel_i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 10, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP13]], 0
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[TOBOOL_NOT]], i32 2, i32 10
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[COND]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END:%.*]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:         [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* {{.*}}, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = select <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 10, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK:         store <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32>* {{.*}}, align 4
 ;
 entry:
   br label %for.body
@@ -80,51 +38,11 @@ for.end:                                          ; preds = %for.end.loopexit, %
 define void @cmpsel_f32(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @cmpsel_f32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP7:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+01, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <vscale x 4 x float>*
-; CHECK-NEXT:    store <vscale x 4 x float> [[TMP7]], <vscale x 4 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP13]], 3.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = select i1 [[CMP1]], float 1.000000e+01, float 2.000000e+00
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[CONV]], float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK:         [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* {{.*}}, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP2:%.*]] = select <vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+01, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK:         store <vscale x 4 x float> [[TMP2]], <vscale x 4 x float>* {{.*}}, align 4
 
 entry:
   br label %for.body
@@ -148,49 +66,10 @@ for.end:                                          ; preds = %for.body, %entry
 define void @fneg_f32(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @fneg_f32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = fneg <vscale x 4 x float> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-NEXT:    store <vscale x 4 x float> [[TMP6]], <vscale x 4 x float>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[TMP12]]
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[FNEG]], float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK:         [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* {{.*}}, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <vscale x 4 x float> [[WIDE_LOAD]]
+; CHECK:         store <vscale x 4 x float> [[TMP1]], <vscale x 4 x float>* {{.*}}, align 4
 
 entry:
   br label %for.body

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
index a1b0e43c1a2aa..b8f941a7a4481 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
@@ -64,16 +64,16 @@ define i64 @int_reduction_and(i64* noalias nocapture %a, i64 %N) {
 ; CHECK-NEXT:    [[N_VEC6:%.*]] = sub i64 [[N]], [[N_MOD_VF5]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI9:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP30:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[INDEX8]], 0
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0
 ; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>*
 ; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4
 ; CHECK-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD10]])
 ; CHECK-NEXT:    [[TMP30]] = and i64 [[TMP29]], [[VEC_PHI9]]
-; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 2
 ; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]]
 ; CHECK-NEXT:    br i1 [[TMP31]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
index 7b2cbca73ad8c..e3e219cc1601a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
@@ -64,15 +64,15 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI9:%.*]] = phi <2 x i64> [ [[TMP24]], [[VEC_EPILOG_PH]] ], [ [[TMP29:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[INDEX8]], 0
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0
 ; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>*
 ; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4
 ; CHECK-NEXT:    [[TMP29]] = add <2 x i64> [[WIDE_LOAD10]], [[VEC_PHI9]]
-; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 2
 ; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]]
 ; CHECK-NEXT:    br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
@@ -91,7 +91,7 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT:    [[ADD]] = add i64 [[TMP32]], [[SUM]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    [[ADD_LCSSA4:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP31]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_END]]

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
index 61b4febe16ce7..a32a6723e83bf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
@@ -60,15 +60,15 @@ define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI8:%.*]] = phi float [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX7]], 0
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0
 ; CHECK-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to <2 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x float>, <2 x float>* [[TMP26]], align 4
 ; CHECK-NEXT:    [[TMP27]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI8]], <2 x float> [[WIDE_LOAD9]])
-; CHECK-NEXT:    [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
 ; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
 ; CHECK-NEXT:    br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
@@ -86,7 +86,7 @@ define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-NEXT:    [[ADD]] = fadd float [[TMP29]], [[SUM_07]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    [[ADD_LCSSA3:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_END]]

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
index 4ef3d4046614c..75b08fc2ff9f7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -74,15 +74,15 @@ define void @main_vf_vscale_x_16(i8* %A) #0 {
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 1024, [[N_MOD_VF2]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[INDEX5]], 0
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP27]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, i8* [[TMP28]], i32 0
 ; CHECK-NEXT:    [[TMP30:%.*]] = bitcast i8* [[TMP29]] to <vscale x 8 x i8>*
 ; CHECK-NEXT:    store <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i8>* [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 8
-; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], [[TMP32]]
+; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], [[TMP32]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[TMP33]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
@@ -151,13 +151,13 @@ define void @main_vf_vscale_x_16(i8* %A) #0 {
 ; CHECK-VF8-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK-VF8:       vec.epilog.vector.body:
-; CHECK-VF8-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-VF8-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-VF8-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX2]], 0
 ; CHECK-VF8-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP21]]
 ; CHECK-VF8-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0
 ; CHECK-VF8-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <8 x i8>*
 ; CHECK-VF8-NEXT:    store <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <8 x i8>* [[TMP24]], align 1
-; CHECK-VF8-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[OFFSET_IDX]], 8
+; CHECK-VF8-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
 ; CHECK-VF8-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
 ; CHECK-VF8-NEXT:    br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK-VF8:       vec.epilog.middle.block:
@@ -257,13 +257,13 @@ define void @main_vf_vscale_x_2(i64* %A) #0 vscale_range(8, 8) {
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX2]], 0
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP22]], i32 0
 ; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <8 x i64>*
 ; CHECK-NEXT:    store <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64>* [[TMP24]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
 ; CHECK-NEXT:    br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
@@ -332,13 +332,13 @@ define void @main_vf_vscale_x_2(i64* %A) #0 vscale_range(8, 8) {
 ; CHECK-VF8-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK-VF8:       vec.epilog.vector.body:
-; CHECK-VF8-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-VF8-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-VF8-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX2]], 0
 ; CHECK-VF8-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP21]]
 ; CHECK-VF8-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP22]], i32 0
 ; CHECK-VF8-NEXT:    [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <8 x i64>*
 ; CHECK-VF8-NEXT:    store <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64>* [[TMP24]], align 1
-; CHECK-VF8-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[OFFSET_IDX]], 8
+; CHECK-VF8-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
 ; CHECK-VF8-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
 ; CHECK-VF8-NEXT:    br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK-VF8:       vec.epilog.middle.block:

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
index 81b07adcfecaa..56a53a5748a35 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
@@ -1,62 +1,17 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 
 define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 {
-; CHECK-LABEL: @inv_store_last_lane(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nsw <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP13:%.*]] = shl i32 [[TMP12]], 2
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], -1
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i32> [[TMP6]], i32 [[TMP14]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP16]], 1
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[INV:%.*]], i64 42
-; CHECK-NEXT:    store i32 [[MUL_LCSSA]], i32* [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @inv_store_last_lane
+; CHECK: vector.body:
+; CHECK:  store <vscale x 4 x i32> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x i32> %[[VEC_VAL]], i32 %[[LAST_LANE]]
 
 entry:
   br label %for.body
@@ -79,56 +34,14 @@ exit:              ; preds = %for.body
 }
 
 define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 {
-; CHECK-LABEL: @ret_last_lane(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-NEXT:    store <vscale x 4 x float> [[TMP6]], <vscale x 4 x float>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP13:%.*]] = shl i32 [[TMP12]], 2
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], -1
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x float> [[TMP6]], i32 [[TMP14]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP16]], 2.000000e+00
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[MUL]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret float [[MUL_LCSSA]]
-;
+; CHECK-LABEL: @ret_last_lane
+; CHECK: vector.body:
+; CHECK:  store <vscale x 4 x float> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x float> %[[VEC_VAL]], i32 %[[LAST_LANE]]
 
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
index 85cb580b95f27..4d0886f4d953d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
@@ -1,51 +1,17 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -mattr=+sve -force-vector-width=4 -pass-remarks-analysis=loop-vectorize -S 2>%t | FileCheck %s
 ; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARKS
 target triple = "aarch64-linux-gnu"
 
 ; CHECK-REMARKS: Scalable vectorization is not supported for all element types found in this loop
 define dso_local void @loop_sve_i128(i128* nocapture %ptr, i64 %N) {
-; CHECK-LABEL: @loop_sve_i128(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i128, i128* [[PTR:%.*]], i64 [[INDUCTION]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i128, i128* [[PTR]], i64 [[INDUCTION1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i128, i128* [[TMP0]], align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load i128, i128* [[TMP1]], align 16
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i128 [[TMP2]], 42
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i128 [[TMP3]], 42
-; CHECK-NEXT:    store i128 [[TMP4]], i128* [[TMP0]], align 16
-; CHECK-NEXT:    store i128 [[TMP5]], i128* [[TMP1]], align 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i128, i128* [[PTR]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i128, i128* [[ARRAYIDX]], align 16
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i128 [[TMP7]], 42
-; CHECK-NEXT:    store i128 [[ADD]], i128* [[ARRAYIDX]], align 16
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @loop_sve_i128
+; CHECK: vector.body
+; CHECK:  %[[LOAD1:.*]] = load i128, i128* {{.*}}
+; CHECK-NEXT: %[[LOAD2:.*]] = load i128, i128* {{.*}}
+; CHECK-NEXT: %[[ADD1:.*]] = add nsw i128 %[[LOAD1]], 42
+; CHECK-NEXT: %[[ADD2:.*]] = add nsw i128 %[[LOAD2]], 42
+; CHECK-NEXT: store i128 %[[ADD1]], i128* {{.*}}
+; CHECK-NEXT: store i128 %[[ADD2]], i128* {{.*}}
 entry:
   br label %for.body
 
@@ -65,47 +31,14 @@ for.end:
 
 ; CHECK-REMARKS: Scalable vectorization is not supported for all element types found in this loop
 define dso_local void @loop_sve_f128(fp128* nocapture %ptr, i64 %N) {
-; CHECK-LABEL: @loop_sve_f128(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds fp128, fp128* [[PTR:%.*]], i64 [[INDUCTION]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds fp128, fp128* [[PTR]], i64 [[INDUCTION1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load fp128, fp128* [[TMP0]], align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load fp128, fp128* [[TMP1]], align 16
-; CHECK-NEXT:    [[TMP4:%.*]] = fsub fp128 [[TMP2]], 0xL00000000000000008000000000000000
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub fp128 [[TMP3]], 0xL00000000000000008000000000000000
-; CHECK-NEXT:    store fp128 [[TMP4]], fp128* [[TMP0]], align 16
-; CHECK-NEXT:    store fp128 [[TMP5]], fp128* [[TMP1]], align 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds fp128, fp128* [[PTR]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load fp128, fp128* [[ARRAYIDX]], align 16
-; CHECK-NEXT:    [[ADD:%.*]] = fsub fp128 [[TMP7]], 0xL00000000000000008000000000000000
-; CHECK-NEXT:    store fp128 [[ADD]], fp128* [[ARRAYIDX]], align 16
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @loop_sve_f128
+; CHECK: vector.body
+; CHECK: %[[LOAD1:.*]] = load fp128, fp128*
+; CHECK-NEXT: %[[LOAD2:.*]] = load fp128, fp128*
+; CHECK-NEXT: %[[FSUB1:.*]] = fsub fp128 %[[LOAD1]], 0xL00000000000000008000000000000000
+; CHECK-NEXT: %[[FSUB2:.*]] = fsub fp128 %[[LOAD2]], 0xL00000000000000008000000000000000
+; CHECK-NEXT: store fp128 %[[FSUB1]], fp128* {{.*}}
+; CHECK-NEXT: store fp128 %[[FSUB2]], fp128* {{.*}}
 entry:
   br label %for.body
 
@@ -125,41 +58,12 @@ for.end:
 
 ; CHECK-REMARKS: Scalable vectorization is not supported for all element types found in this loop
 define dso_local void @loop_invariant_sve_i128(i128* nocapture %ptr, i128 %val, i64 %N) {
-; CHECK-LABEL: @loop_invariant_sve_i128(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i128, i128* [[PTR:%.*]], i64 [[INDUCTION]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i128, i128* [[PTR]], i64 [[INDUCTION1]]
-; CHECK-NEXT:    store i128 [[VAL:%.*]], i128* [[TMP0]], align 16
-; CHECK-NEXT:    store i128 [[VAL]], i128* [[TMP1]], align 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i128, i128* [[PTR]], i64 [[IV]]
-; CHECK-NEXT:    store i128 [[VAL]], i128* [[ARRAYIDX]], align 16
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @loop_invariant_sve_i128
+; CHECK: vector.body
+; CHECK: %[[GEP1:.*]] = getelementptr inbounds i128, i128* %ptr
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i128, i128* %ptr
+; CHECK-NEXT: store i128 %val, i128* %[[GEP1]]
+; CHECK-NEXT: store i128 %val, i128* %[[GEP2]]
 entry:
   br label %for.body
 
@@ -177,68 +81,15 @@ for.end:
 
 ; CHECK-REMARKS: Scalable vectorization is not supported for all element types found in this loop
 define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) {
-; CHECK-LABEL: @uniform_store_i1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i64, i64* [[START:%.*]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64*> poison, i64* [[START]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64*> [[BROADCAST_SPLATINSERT]], <2 x i64*> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64*> poison, i64* [[START]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64*> [[BROADCAST_SPLATINSERT3]], <2 x i64*> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i64* [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <2 x i64> <i64 0, i64 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <2 x i64> <i64 2, i64 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64*> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, <2 x i64*> [[TMP1]], i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, <2 x i64*> [[TMP2]], i64 1
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <2 x i64*> [[TMP8]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <2 x i64*> [[TMP9]], [[BROADCAST_SPLAT4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
-; CHECK-NEXT:    store i1 [[TMP12]], i1* [[DST:%.*]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
-; CHECK-NEXT:    store i1 [[TMP13]], i1* [[DST]], align 1
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
-; CHECK-NEXT:    store i1 [[TMP14]], i1* [[DST]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1
-; CHECK-NEXT:    store i1 [[TMP15]], i1* [[DST]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[PTR_IND]] = getelementptr i64, i64* [[POINTER_PHI]], i64 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[FIRST_SROA:%.*]] = phi i64* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TMP17:%.*]] = load i64, i64* [[FIRST_SROA]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i64, i64* [[FIRST_SROA]], i64 1
-; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64* [[INCDEC_PTR]], [[START]]
-; CHECK-NEXT:    store i1 [[CMP_NOT]], i1* [[DST]], align 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[END]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @uniform_store_i1
+; CHECK: vector.body
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <2 x i64*> {{.*}}, i64 1
+; CHECK: %[[ICMP:.*]] = icmp eq <2 x i64*> %[[GEP]], %[[SPLAT:.*]]
+; CHECK: %[[EXTRACT1:.*]] = extractelement <2 x i1> %[[ICMP]], i32 0
+; CHECK: store i1 %[[EXTRACT1]], i1* %dst
+; CHECK: %[[EXTRACT2:.*]] = extractelement <2 x i1> %[[ICMP]], i32 1
+; CHECK: store i1 %[[EXTRACT2]], i1* %dst
+; CHECK-NOT: vscale
 entry:
   br label %for.body
 
@@ -258,45 +109,11 @@ end:
 }
 
 define dso_local void @loop_fixed_width_i128(i128* nocapture %ptr, i64 %N) {
-; CHECK-LABEL: @loop_fixed_width_i128(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i128, i128* [[PTR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i128, i128* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i128* [[TMP2]] to <4 x i128>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i128>, <4 x i128>* [[TMP3]], align 16
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i128> [[WIDE_LOAD]], <i128 42, i128 42, i128 42, i128 42>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i128* [[TMP2]] to <4 x i128>*
-; CHECK-NEXT:    store <4 x i128> [[TMP4]], <4 x i128>* [[TMP5]], align 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i128, i128* [[PTR]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i128, i128* [[ARRAYIDX]], align 16
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i128 [[TMP7]], 42
-; CHECK-NEXT:    store i128 [[ADD]], i128* [[ARRAYIDX]], align 16
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @loop_fixed_width_i128
+; CHECK: load <4 x i128>, <4 x i128>*
+; CHECK: add nsw <4 x i128> {{.*}}, <i128 42, i128 42, i128 42, i128 42>
+; CHECK: store <4 x i128> {{.*}} <4 x i128>*
+; CHECK-NOT: vscale
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 541bb93794620..1f9964b04457d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt -loop-vectorize -S < %s -debug 2>%t | FileCheck %s
 ; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
@@ -11,64 +10,26 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define void @induction_i7(i64* %dst) #0 {
 ; CHECK-LABEL: @induction_i7(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
-; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i7>
+; CHECK:         [[TMP4:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
+; CHECK:         [[TMP5:%.*]] = trunc <vscale x 2 x i8> %4 to <vscale x 2 x i7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 2 x i7> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 2 x i7> [[TMP6]], shufflevector (<vscale x 2 x i7> insertelement (<vscale x 2 x i7> poison, i7 1, i32 0), <vscale x 2 x i7> poison, <vscale x 2 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i7> zeroinitializer, [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i7 @llvm.vscale.i7()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i7 [[TMP8]], 2
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i7 1, [[TMP9]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i7> poison, i7 [[TMP10]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i7> [[DOTSPLATINSERT]], <vscale x 2 x i7> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i7> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i7> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP14:%.*]] = zext <vscale x 2 x i7> [[TMP12]] to <vscale x 2 x i64>
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, i64* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i64* [[TMP15]] to <vscale x 2 x i64>*
-; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64>* [[TMP16]], align 8
-; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i7> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i7 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV1294:%.*]] = phi i7 [ [[INDVARS_IV_NEXT1295:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[INDVARS_IV1286:%.*]] = phi i64 [ [[INDVARS_IV_NEXT1287:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ADDI7:%.*]] = add i7 [[INDVARS_IV1294]], 0
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[DST]], i64 [[INDVARS_IV1286]]
-; CHECK-NEXT:    [[EXT:%.*]] = zext i7 [[ADDI7]] to i64
-; CHECK-NEXT:    store i64 [[EXT]], i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[INDVARS_IV_NEXT1287]] = add nuw nsw i64 [[INDVARS_IV1286]], 1
-; CHECK-NEXT:    [[INDVARS_IV_NEXT1295]] = add i7 [[INDVARS_IV1294]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1287]], 64
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i7> [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 2 x i7> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP10]]
+; CHECK-NEXT:    [[EXT:%.+]]  = zext <vscale x 2 x i7> [[TMP11]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64* [[TMP13]] to <vscale x 2 x i64>*
+; CHECK-NEXT:    store <vscale x 2 x i64> [[EXT]], <vscale x 2 x i64>* [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i7> [[VEC_IND]], 
 ;
 entry:
   br label %for.body
@@ -96,62 +57,25 @@ for.end:                                          ; preds = %for.body
 
 define void @induction_i3_zext(i64* %dst) #0 {
 ; CHECK-LABEL: @induction_i3_zext(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
-; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i3>
+; CHECK:         [[TMP4:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
+; CHECK:         [[TMP5:%.*]] = trunc <vscale x 2 x i8> %4 to <vscale x 2 x i3>
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 2 x i3> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 2 x i3> [[TMP6]], shufflevector (<vscale x 2 x i3> insertelement (<vscale x 2 x i3> poison, i3 1, i32 0), <vscale x 2 x i3> poison, <vscale x 2 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i3> zeroinitializer, [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i3 @llvm.vscale.i3()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i3 [[TMP8]], 2
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i3 1, [[TMP9]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i3> poison, i3 [[TMP10]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i3> [[DOTSPLATINSERT]], <vscale x 2 x i3> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i3> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = zext <vscale x 2 x i3> [[VEC_IND]] to <vscale x 2 x i64>
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, i64* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i64* [[TMP14]] to <vscale x 2 x i64>*
-; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64>* [[TMP15]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]]
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i3> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i3 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV1294:%.*]] = phi i3 [ [[INDVARS_IV_NEXT1295:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[INDVARS_IV1286:%.*]] = phi i64 [ [[INDVARS_IV_NEXT1287:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ZEXTI3:%.*]] = zext i3 [[INDVARS_IV1294]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[DST]], i64 [[INDVARS_IV1286]]
-; CHECK-NEXT:    store i64 [[ZEXTI3]], i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[INDVARS_IV_NEXT1287]] = add nuw nsw i64 [[INDVARS_IV1286]], 1
-; CHECK-NEXT:    [[INDVARS_IV_NEXT1295]] = add i3 [[INDVARS_IV1294]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1287]], 64
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i3> [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = zext <vscale x 2 x i3> [[VEC_IND]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64* [[TMP13]] to <vscale x 2 x i64>*
+; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64>* [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i3> [[VEC_IND]], 
 ;
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll
index 71614283a0b93..a12ec7f29c42b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll
@@ -1,61 +1,15 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -mattr=+sve -mtriple aarch64-linux-gnu < %s | FileCheck %s
 
 define void @invariant_load(i64 %n, i32* noalias nocapture %a, i32* nocapture readonly %b) {
-; CHECK-LABEL: @invariant_load(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 42
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP6]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 42
-; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @invariant_load
+; CHECK: vector.body:
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i32, i32* %b, i64 42
+; CHECK-NEXT: %[[INVLOAD:.*]] = load i32, i32* %[[GEP]]
+; CHECK-NEXT: %[[SPLATINS:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[INVLOAD]], i32 0
+; CHECK-NEXT: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[SPLATINS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: %[[LOAD:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>*
+; CHECK-NEXT: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[SPLAT]], %[[LOAD]]
+; CHECK: store <vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32>*
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
index c4ce0fd3cf6dd..3d78b6f1f0ff1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -1,53 +1,15 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -S < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
 define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @inv_store_i16(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16*> poison, i16* [[DST:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:         %[[TMP1:.*]] = insertelement <vscale x 4 x i16*> poison, i16* %dst, i32 0
+; CHECK-NEXT:    %[[SPLAT_PTRS:.*]] = shufflevector <vscale x 4 x i16*> %[[TMP1]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[SRC:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <vscale x 4 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP7]], align 2
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> [[WIDE_LOAD]], <vscale x 4 x i16*> [[BROADCAST_SPLAT]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_INC24:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY14:%.*]]
-; CHECK:       for.body14:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY14]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[SRC]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; CHECK-NEXT:    store i16 [[LD]], i16* [[DST]], align 2
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_INC24]], label [[FOR_BODY14]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       for.inc24:
-; CHECK-NEXT:    ret void
-;
+; CHECK:         %[[VECLOAD:.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* %{{.*}}, align 2
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> %[[VECLOAD]], <vscale x 4 x i16*> %[[SPLAT_PTRS]], i32 2
 entry:
   br label %for.body14
 
@@ -67,55 +29,13 @@ for.inc24:                                        ; preds = %for.body14, %for.bo
 
 define void @cond_inv_store_i32(i32* noalias %dst, i32* noalias readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @cond_inv_store_i32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[DST:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:         %[[TMP1:.*]] = insertelement <vscale x 4 x i32*> poison, i32* %dst, i32 0
+; CHECK-NEXT:    %[[SPLAT_PTRS:.*]] = shufflevector <vscale x 4 x i32*> %[[TMP1]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP8]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_09:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[I_09]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 0
-; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK:       if.then:
-; CHECK-NEXT:    store i32 [[TMP12]], i32* [[DST]], align 4
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_09]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK:         %[[VECLOAD:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* %{{.*}}, align 4
+; CHECK-NEXT:    %[[MASK:.*]] = icmp sgt <vscale x 4 x i32> %[[VECLOAD]], zeroinitializer
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[VECLOAD]], <vscale x 4 x i32*> %[[SPLAT_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]])
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll
index 2567bf43350eb..1e40638703583 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll
@@ -1,61 +1,16 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S %s -o - | FileCheck %s
 
 define void @mloadstore_f32(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @mloadstore_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x float> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0nxv4f32(<vscale x 4 x float> [[TMP9]], <vscale x 4 x float>* [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP6]])
-; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP12:%.*]] = shl i64 [[TMP11]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[I_011]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP14]], 0.000000e+00
-; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I_011]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_011]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @mloadstore_f32
+; CHECK: vector.body:
+; CHECK:       %[[LOAD1:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-NEXT:  %[[MASK:.*]] = fcmp ogt <vscale x 4 x float> %[[LOAD1]],
+; CHECK-NEXT:  %[[GEPA:.*]] = getelementptr float, float* %a,
+; CHECK-NEXT:  %[[MLOAD_PTRS:.*]] = bitcast float* %[[GEPA]] to <vscale x 4 x float>*
+; CHECK-NEXT:  %[[LOAD2:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* %[[MLOAD_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]]
+; CHECK-NEXT:  %[[FADD:.*]] = fadd <vscale x 4 x float> %[[LOAD1]], %[[LOAD2]]
+; CHECK-NEXT:  %[[MSTORE_PTRS:.*]] = bitcast float* %[[GEPA]] to <vscale x 4 x float>*
+; CHECK-NEXT:  call void @llvm.masked.store.nxv4f32.p0nxv4f32(<vscale x 4 x float> %[[FADD]], <vscale x 4 x float>* %[[MSTORE_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]])
 entry:
   br label %for.body
 
@@ -83,60 +38,16 @@ exit:                                 ; preds = %for.inc
 }
 
 define void @mloadstore_i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @mloadstore_i32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32>* [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP6]])
-; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP12:%.*]] = shl i64 [[TMP11]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_011]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP14]], 0
-; CHECK-NEXT:    br i1 [[CMP1_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_011]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_011]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @mloadstore_i32
+; CHECK: vector.body:
+; CHECK:       %[[LOAD1:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>*
+; CHECK-NEXT:  %[[MASK:.*]] = icmp ne <vscale x 4 x i32> %[[LOAD1]],
+; CHECK-NEXT:  %[[GEPA:.*]] = getelementptr i32, i32* %a,
+; CHECK-NEXT:  %[[MLOAD_PTRS:.*]] = bitcast i32* %[[GEPA]] to <vscale x 4 x i32>*
+; CHECK-NEXT:  %[[LOAD2:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %[[MLOAD_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]]
+; CHECK-NEXT:  %[[FADD:.*]] = add <vscale x 4 x i32> %[[LOAD1]], %[[LOAD2]]
+; CHECK-NEXT:  %[[MSTORE_PTRS:.*]] = bitcast i32* %[[GEPA]] to <vscale x 4 x i32>*
+; CHECK-NEXT:  call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[FADD]], <vscale x 4 x i32>* %[[MSTORE_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]])
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
index 84c1cd883c41e..38456ad1da9d5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
@@ -1,161 +1,44 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1
 ; RUN: opt -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4
 
 target triple = "aarch64-linux-gnu"
 
 define i32 @select_const_i32_from_icmp(i32* nocapture readonly %v, i64 %n) #0 {
-; CHECK-VF4IC1-LABEL: @select_const_i32_from_icmp(
-; CHECK-VF4IC1-NEXT:  entry:
-; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-VF4IC1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC1:       vector.ph:
-; CHECK-VF4IC1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-VF4IC1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC1-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC1:       vector.body:
-; CHECK-VF4IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
-; CHECK-VF4IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
-; CHECK-VF4IC1-NEXT:    [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:    [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-VF4IC1-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-VF4IC1:       middle.block:
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 7, i32 3
-; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC1:       scalar.ph:
-; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 3, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC1:       for.body:
-; CHECK-VF4IC1-NEXT:    [[TMP14:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP15:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP14]]
-; CHECK-VF4IC1-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
-; CHECK-VF4IC1-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP17]], 3
-; CHECK-VF4IC1-NEXT:    [[TMP19]] = select i1 [[TMP18]], i32 [[TMP15]], i32 7
-; CHECK-VF4IC1-NEXT:    [[TMP20]] = add nuw nsw i64 [[TMP14]], 1
-; CHECK-VF4IC1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[N]]
-; CHECK-VF4IC1-NEXT:    br i1 [[TMP21]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-VF4IC1:       exit:
-; CHECK-VF4IC1-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP19]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT:    ret i32 [[DOTLCSSA]]
-;
-; CHECK-VF4IC4-LABEL: @select_const_i32_from_icmp(
-; CHECK-VF4IC4-NEXT:  entry:
-; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-VF4IC4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC4:       vector.ph:
-; CHECK-VF4IC4-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-VF4IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC4-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC4:       vector.body:
-; CHECK-VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP44:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-VF4IC4-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-VF4IC4-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
-; CHECK-VF4IC4-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-VF4IC4-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 12
-; CHECK-VF4IC4-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP9]]
-; CHECK-VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP14]]
-; CHECK-VF4IC4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP19]]
-; CHECK-VF4IC4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-VF4IC4-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP25]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; CHECK-VF4IC4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP27]]
-; CHECK-VF4IC4-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP29]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], 8
-; CHECK-VF4IC4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP31]]
-; CHECK-VF4IC4-NEXT:    [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP33]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP35:%.*]] = mul i32 [[TMP34]], 12
-; CHECK-VF4IC4-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP35]]
-; CHECK-VF4IC4-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP37]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP38:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP39:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD4]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP40:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP41:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD6]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP42]] = select <vscale x 4 x i1> [[TMP38]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP43]] = select <vscale x 4 x i1> [[TMP39]], <vscale x 4 x i32> [[VEC_PHI1]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP44]] = select <vscale x 4 x i1> [[TMP40]], <vscale x 4 x i32> [[VEC_PHI2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP45]] = select <vscale x 4 x i1> [[TMP41]], <vscale x 4 x i32> [[VEC_PHI3]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP46:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP47:%.*]] = mul i64 [[TMP46]], 16
-; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP47]]
-; CHECK-VF4IC4-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-VF4IC4:       middle.block:
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP42]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP]], <vscale x 4 x i32> [[TMP42]], <vscale x 4 x i32> [[TMP43]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP7:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT8:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP7]], <vscale x 4 x i32> [[RDX_SELECT]], <vscale x 4 x i32> [[TMP44]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP9:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT8]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT10:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP9]], <vscale x 4 x i32> [[RDX_SELECT8]], <vscale x 4 x i32> [[TMP45]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP11:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP49:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP11]])
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT12:%.*]] = select i1 [[TMP49]], i32 7, i32 3
-; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC4:       scalar.ph:
-; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 3, [[ENTRY]] ], [ [[RDX_SELECT12]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC4:       for.body:
-; CHECK-VF4IC4-NEXT:    [[TMP50:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP56:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP51:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP55:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP50]]
-; CHECK-VF4IC4-NEXT:    [[TMP53:%.*]] = load i32, i32* [[TMP52]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP54:%.*]] = icmp eq i32 [[TMP53]], 3
-; CHECK-VF4IC4-NEXT:    [[TMP55]] = select i1 [[TMP54]], i32 [[TMP51]], i32 7
-; CHECK-VF4IC4-NEXT:    [[TMP56]] = add nuw nsw i64 [[TMP50]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP57:%.*]] = icmp eq i64 [[TMP56]], [[N]]
-; CHECK-VF4IC4-NEXT:    br i1 [[TMP57]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-VF4IC4:       exit:
-; CHECK-VF4IC4-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP55]], [[FOR_BODY]] ], [ [[RDX_SELECT12]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT:    ret i32 [[DOTLCSSA]]
-;
-
+; CHECK-VF4IC1-LABEL: @select_const_i32_from_icmp
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
+; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+
+; CHECK-VF4IC4-LABEL: @select_const_i32_from_icmp
+; CHECK-VF4IC4:      vector.body:
+; CHECK-VF4IC4:        [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_ICMP1:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP2:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP3:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP4:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL1]] = select <vscale x 4 x i1> [[VEC_ICMP1]], <vscale x 4 x i32> [[VEC_PHI1]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL2]] = select <vscale x 4 x i1> [[VEC_ICMP2]], <vscale x 4 x i32> [[VEC_PHI2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL3]] = select <vscale x 4 x i1> [[VEC_ICMP3]], <vscale x 4 x i32> [[VEC_PHI3]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL4]] = select <vscale x 4 x i1> [[VEC_ICMP4]], <vscale x 4 x i32> [[VEC_PHI4]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4:      middle.block:
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP5:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL1]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL5:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP5]], <vscale x 4 x i32> [[VEC_SEL1]], <vscale x 4 x i32> [[VEC_SEL2]]
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP6:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL6:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP6]], <vscale x 4 x i32> [[VEC_SEL5]], <vscale x 4 x i32> [[VEC_SEL3]]
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP7:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL6]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL7:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP7]], <vscale x 4 x i32> [[VEC_SEL6]], <vscale x 4 x i32> [[VEC_SEL4]]
+; CHECK-VF4IC4-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC4-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
 entry:
   br label %for.body
 
@@ -175,181 +58,26 @@ exit:                                     ; preds = %for.body
 }
 
 define i32 @select_i32_from_icmp(i32* nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 {
-; CHECK-VF4IC1-LABEL: @select_i32_from_icmp(
-; CHECK-VF4IC1-NEXT:  entry:
-; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-VF4IC1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC1:       vector.ph:
-; CHECK-VF4IC1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-VF4IC1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC1-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i32 0
-; CHECK-VF4IC1-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i32 0
-; CHECK-VF4IC1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC1:       vector.body:
-; CHECK-VF4IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
-; CHECK-VF4IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
-; CHECK-VF4IC1-NEXT:    [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:    [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[BROADCAST_SPLAT]]
-; CHECK-VF4IC1-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-VF4IC1-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-VF4IC1:       middle.block:
-; CHECK-VF4IC1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i32 0
-; CHECK-VF4IC1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], [[DOTSPLAT]]
-; CHECK-VF4IC1-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[B]], i32 [[A]]
-; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC1:       scalar.ph:
-; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC1:       for.body:
-; CHECK-VF4IC1-NEXT:    [[TMP14:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP15:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP14]]
-; CHECK-VF4IC1-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
-; CHECK-VF4IC1-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP17]], 3
-; CHECK-VF4IC1-NEXT:    [[TMP19]] = select i1 [[TMP18]], i32 [[TMP15]], i32 [[B]]
-; CHECK-VF4IC1-NEXT:    [[TMP20]] = add nuw nsw i64 [[TMP14]], 1
-; CHECK-VF4IC1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[N]]
-; CHECK-VF4IC1-NEXT:    br i1 [[TMP21]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK-VF4IC1:       exit:
-; CHECK-VF4IC1-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP19]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT:    ret i32 [[DOTLCSSA]]
-;
-; CHECK-VF4IC4-LABEL: @select_i32_from_icmp(
-; CHECK-VF4IC4-NEXT:  entry:
-; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-VF4IC4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC4:       vector.ph:
-; CHECK-VF4IC4-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-VF4IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC4-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i32 0
-; CHECK-VF4IC4-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i32 0
-; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B]], i32 0
-; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT7]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B]], i32 0
-; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT9]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B]], i32 0
-; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT11]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC4:       vector.body:
-; CHECK-VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP44:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-VF4IC4-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-VF4IC4-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
-; CHECK-VF4IC4-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-VF4IC4-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 12
-; CHECK-VF4IC4-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP9]]
-; CHECK-VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP14]]
-; CHECK-VF4IC4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP19]]
-; CHECK-VF4IC4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-VF4IC4-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP25]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; CHECK-VF4IC4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP27]]
-; CHECK-VF4IC4-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP29]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], 8
-; CHECK-VF4IC4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP31]]
-; CHECK-VF4IC4-NEXT:    [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP33]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP35:%.*]] = mul i32 [[TMP34]], 12
-; CHECK-VF4IC4-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP35]]
-; CHECK-VF4IC4-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP37]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP38:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP39:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD4]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP40:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP41:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD6]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP42]] = select <vscale x 4 x i1> [[TMP38]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[BROADCAST_SPLAT]]
-; CHECK-VF4IC4-NEXT:    [[TMP43]] = select <vscale x 4 x i1> [[TMP39]], <vscale x 4 x i32> [[VEC_PHI1]], <vscale x 4 x i32> [[BROADCAST_SPLAT8]]
-; CHECK-VF4IC4-NEXT:    [[TMP44]] = select <vscale x 4 x i1> [[TMP40]], <vscale x 4 x i32> [[VEC_PHI2]], <vscale x 4 x i32> [[BROADCAST_SPLAT10]]
-; CHECK-VF4IC4-NEXT:    [[TMP45]] = select <vscale x 4 x i1> [[TMP41]], <vscale x 4 x i32> [[VEC_PHI3]], <vscale x 4 x i32> [[BROADCAST_SPLAT12]]
-; CHECK-VF4IC4-NEXT:    [[TMP46:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP47:%.*]] = mul i64 [[TMP46]], 16
-; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP47]]
-; CHECK-VF4IC4-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-VF4IC4:       middle.block:
-; CHECK-VF4IC4-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i32 0
-; CHECK-VF4IC4-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP42]], [[DOTSPLAT]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP]], <vscale x 4 x i32> [[TMP42]], <vscale x 4 x i32> [[TMP43]]
-; CHECK-VF4IC4-NEXT:    [[DOTSPLATINSERT13:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i32 0
-; CHECK-VF4IC4-NEXT:    [[DOTSPLAT14:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT13]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP15:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT]], [[DOTSPLAT14]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT16:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP15]], <vscale x 4 x i32> [[RDX_SELECT]], <vscale x 4 x i32> [[TMP44]]
-; CHECK-VF4IC4-NEXT:    [[DOTSPLATINSERT17:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i32 0
-; CHECK-VF4IC4-NEXT:    [[DOTSPLAT18:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT17]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP19:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT16]], [[DOTSPLAT18]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT20:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP19]], <vscale x 4 x i32> [[RDX_SELECT16]], <vscale x 4 x i32> [[TMP45]]
-; CHECK-VF4IC4-NEXT:    [[DOTSPLATINSERT21:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i32 0
-; CHECK-VF4IC4-NEXT:    [[DOTSPLAT22:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT21]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP23:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT20]], [[DOTSPLAT22]]
-; CHECK-VF4IC4-NEXT:    [[TMP49:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP23]])
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT24:%.*]] = select i1 [[TMP49]], i32 [[B]], i32 [[A]]
-; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC4:       scalar.ph:
-; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_SELECT24]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC4:       for.body:
-; CHECK-VF4IC4-NEXT:    [[TMP50:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP56:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP51:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP55:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP50]]
-; CHECK-VF4IC4-NEXT:    [[TMP53:%.*]] = load i32, i32* [[TMP52]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP54:%.*]] = icmp eq i32 [[TMP53]], 3
-; CHECK-VF4IC4-NEXT:    [[TMP55]] = select i1 [[TMP54]], i32 [[TMP51]], i32 [[B]]
-; CHECK-VF4IC4-NEXT:    [[TMP56]] = add nuw nsw i64 [[TMP50]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP57:%.*]] = icmp eq i64 [[TMP56]], [[N]]
-; CHECK-VF4IC4-NEXT:    br i1 [[TMP57]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK-VF4IC4:       exit:
-; CHECK-VF4IC4-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP55]], [[FOR_BODY]] ], [ [[RDX_SELECT24]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT:    ret i32 [[DOTLCSSA]]
-;
-
+; CHECK-VF4IC1-LABEL: @select_i32_from_icmp
+; CHECK-VF4IC1:      vector.ph:
+; CHECK-VF4IC1:        [[TMP1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
+; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_A:%.*]] = shufflevector <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[TMP2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_B:%.*]] = shufflevector <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
+; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[SPLAT_OF_B]]
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
+; CHECK-VF4IC1-NEXT:   [[FIN_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[FIN_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[FIN_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_CMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
+
+; CHECK-VF4IC4-LABEL: @select_i32_from_icmp
+; CHECK-VF4IC4:      vector.body:
 entry:
   br label %for.body
 
@@ -369,157 +97,19 @@ exit:                                     ; preds = %for.body
 }
 
 define i32 @select_const_i32_from_fcmp(float* nocapture readonly %v, i64 %n) #0 {
-; CHECK-VF4IC1-LABEL: @select_const_i32_from_fcmp(
-; CHECK-VF4IC1-NEXT:  entry:
-; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-VF4IC1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC1:       vector.ph:
-; CHECK-VF4IC1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-VF4IC1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC1-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC1:       vector.body:
-; CHECK-VF4IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[V:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
-; CHECK-VF4IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
-; CHECK-VF4IC1-NEXT:    [[TMP8:%.*]] = fcmp fast ueq <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:    [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-VF4IC1-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-VF4IC1:       middle.block:
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 1, i32 2
-; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC1:       scalar.ph:
-; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC1:       for.body:
-; CHECK-VF4IC1-NEXT:    [[TMP14:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP15:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[V]], i64 [[TMP14]]
-; CHECK-VF4IC1-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4
-; CHECK-VF4IC1-NEXT:    [[TMP18:%.*]] = fcmp fast ueq float [[TMP17]], 3.000000e+00
-; CHECK-VF4IC1-NEXT:    [[TMP19]] = select i1 [[TMP18]], i32 [[TMP15]], i32 1
-; CHECK-VF4IC1-NEXT:    [[TMP20]] = add nuw nsw i64 [[TMP14]], 1
-; CHECK-VF4IC1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[N]]
-; CHECK-VF4IC1-NEXT:    br i1 [[TMP21]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-VF4IC1:       exit:
-; CHECK-VF4IC1-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP19]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT:    ret i32 [[DOTLCSSA]]
-;
-; CHECK-VF4IC4-LABEL: @select_const_i32_from_fcmp(
-; CHECK-VF4IC4-NEXT:  entry:
-; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-VF4IC4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC4:       vector.ph:
-; CHECK-VF4IC4-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-VF4IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC4-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC4:       vector.body:
-; CHECK-VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP44:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-VF4IC4-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-VF4IC4-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
-; CHECK-VF4IC4-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-VF4IC4-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 12
-; CHECK-VF4IC4-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[V:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[V]], i64 [[TMP9]]
-; CHECK-VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[V]], i64 [[TMP14]]
-; CHECK-VF4IC4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[V]], i64 [[TMP19]]
-; CHECK-VF4IC4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-VF4IC4-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 4 x float>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP25]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; CHECK-VF4IC4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-VF4IC4-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 4 x float>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP29]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], 8
-; CHECK-VF4IC4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-VF4IC4-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 4 x float>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP33]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP35:%.*]] = mul i32 [[TMP34]], 12
-; CHECK-VF4IC4-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-VF4IC4-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 4 x float>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP37]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP38:%.*]] = fcmp fast ueq <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP39:%.*]] = fcmp fast ueq <vscale x 4 x float> [[WIDE_LOAD4]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP40:%.*]] = fcmp fast ueq <vscale x 4 x float> [[WIDE_LOAD5]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP41:%.*]] = fcmp fast ueq <vscale x 4 x float> [[WIDE_LOAD6]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP42]] = select <vscale x 4 x i1> [[TMP38]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP43]] = select <vscale x 4 x i1> [[TMP39]], <vscale x 4 x i32> [[VEC_PHI1]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP44]] = select <vscale x 4 x i1> [[TMP40]], <vscale x 4 x i32> [[VEC_PHI2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP45]] = select <vscale x 4 x i1> [[TMP41]], <vscale x 4 x i32> [[VEC_PHI3]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP46:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP47:%.*]] = mul i64 [[TMP46]], 16
-; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP47]]
-; CHECK-VF4IC4-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-VF4IC4:       middle.block:
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP42]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP]], <vscale x 4 x i32> [[TMP42]], <vscale x 4 x i32> [[TMP43]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP7:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT8:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP7]], <vscale x 4 x i32> [[RDX_SELECT]], <vscale x 4 x i32> [[TMP44]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP9:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT8]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT10:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP9]], <vscale x 4 x i32> [[RDX_SELECT8]], <vscale x 4 x i32> [[TMP45]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP11:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP49:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP11]])
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT12:%.*]] = select i1 [[TMP49]], i32 1, i32 2
-; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC4:       scalar.ph:
-; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[RDX_SELECT12]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC4:       for.body:
-; CHECK-VF4IC4-NEXT:    [[TMP50:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP56:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP51:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP55:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP52:%.*]] = getelementptr inbounds float, float* [[V]], i64 [[TMP50]]
-; CHECK-VF4IC4-NEXT:    [[TMP53:%.*]] = load float, float* [[TMP52]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP54:%.*]] = fcmp fast ueq float [[TMP53]], 3.000000e+00
-; CHECK-VF4IC4-NEXT:    [[TMP55]] = select i1 [[TMP54]], i32 [[TMP51]], i32 1
-; CHECK-VF4IC4-NEXT:    [[TMP56]] = add nuw nsw i64 [[TMP50]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP57:%.*]] = icmp eq i64 [[TMP56]], [[N]]
-; CHECK-VF4IC4-NEXT:    br i1 [[TMP57]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-VF4IC4:       exit:
-; CHECK-VF4IC4-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP55]], [[FOR_BODY]] ], [ [[RDX_SELECT12]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT:    ret i32 [[DOTLCSSA]]
-;
-
+; CHECK-VF4IC1-LABEL: @select_const_i32_from_fcmp
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x float>
+; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = fcmp fast ueq <vscale x 4 x float> [[VEC_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
+
+; CHECK-VF4IC4-LABEL: @select_const_i32_from_fcmp
+; CHECK-VF4IC4:      vector.body:
 entry:
   br label %for.body
 
@@ -539,40 +129,10 @@ exit:                                     ; preds = %for.body
 }
 
 define float @select_const_f32_from_icmp(i32* nocapture readonly %v, i64 %n) #0 {
-; CHECK-VF4IC1-LABEL: @select_const_f32_from_icmp(
-; CHECK-VF4IC1-NEXT:  entry:
-; CHECK-VF4IC1-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC1:       for.body:
-; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = phi fast float [ 3.000000e+00, [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP0]]
-; CHECK-VF4IC1-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
-; CHECK-VF4IC1-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 3
-; CHECK-VF4IC1-NEXT:    [[TMP5]] = select fast i1 [[TMP4]], float [[TMP1]], float 7.000000e+00
-; CHECK-VF4IC1-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
-; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP6]], [[N:%.*]]
-; CHECK-VF4IC1-NEXT:    br i1 [[TMP7]], label [[EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-VF4IC1:       exit:
-; CHECK-VF4IC1-NEXT:    [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    ret float [[DOTLCSSA]]
-;
-; CHECK-VF4IC4-LABEL: @select_const_f32_from_icmp(
-; CHECK-VF4IC4-NEXT:  entry:
-; CHECK-VF4IC4-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC4:       for.body:
-; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP1:%.*]] = phi fast float [ 3.000000e+00, [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP0]]
-; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 3
-; CHECK-VF4IC4-NEXT:    [[TMP5]] = select fast i1 [[TMP4]], float [[TMP1]], float 7.000000e+00
-; CHECK-VF4IC4-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP6]], [[N:%.*]]
-; CHECK-VF4IC4-NEXT:    br i1 [[TMP7]], label [[EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-VF4IC4:       exit:
-; CHECK-VF4IC4-NEXT:    [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    ret float [[DOTLCSSA]]
-;
+; CHECK-VF4IC1-LABEL: @select_const_f32_from_icmp
+; CHECK-VF4IC1-NOT: vector.body
+; CHECK-VF4IC4-LABEL: @select_const_f32_from_icmp
+; CHECK-VF4IC4-NOT: vector.body
 entry:
   br label %for.body
 
@@ -592,214 +152,22 @@ exit:                                     ; preds = %for.body
 }
 
 define i32 @pred_select_const_i32_from_icmp(i32* noalias nocapture readonly %src1, i32* noalias nocapture readonly %src2, i64 %n) #0 {
-; CHECK-VF4IC1-LABEL: @pred_select_const_i32_from_icmp(
-; CHECK-VF4IC1-NEXT:  entry:
-; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-VF4IC1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC1:       vector.ph:
-; CHECK-VF4IC1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-VF4IC1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC1-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC1:       vector.body:
-; CHECK-VF4IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
-; CHECK-VF4IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
-; CHECK-VF4IC1-NEXT:    [[TMP8:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[SRC2:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC1-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
-; CHECK-VF4IC1-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <vscale x 4 x i32>*
-; CHECK-VF4IC1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
-; CHECK-VF4IC1-NEXT:    [[TMP12:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-VF4IC1-NEXT:    [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:    [[PREDPHI]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-VF4IC1-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
-; CHECK-VF4IC1-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-VF4IC1:       middle.block:
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[PREDPHI]], zeroinitializer
-; CHECK-VF4IC1-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP18]], i32 1, i32 0
-; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC1:       scalar.ph:
-; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC1:       for.body:
-; CHECK-VF4IC1-NEXT:    [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-VF4IC1-NEXT:    [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-VF4IC1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[I_013]]
-; CHECK-VF4IC1-NEXT:    [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-VF4IC1-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP19]], 35
-; CHECK-VF4IC1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-VF4IC1:       if.then:
-; CHECK-VF4IC1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[I_013]]
-; CHECK-VF4IC1-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-VF4IC1-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[TMP20]], 2
-; CHECK-VF4IC1-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; CHECK-VF4IC1-NEXT:    br label [[FOR_INC]]
-; CHECK-VF4IC1:       for.inc:
-; CHECK-VF4IC1-NEXT:    [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
-; CHECK-VF4IC1-NEXT:    [[INC]] = add nuw nsw i64 [[I_013]], 1
-; CHECK-VF4IC1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-VF4IC1:       for.end.loopexit:
-; CHECK-VF4IC1-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT:    ret i32 [[R_1_LCSSA]]
-;
-; CHECK-VF4IC4-LABEL: @pred_select_const_i32_from_icmp(
-; CHECK-VF4IC4-NEXT:  entry:
-; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-VF4IC4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC4:       vector.ph:
-; CHECK-VF4IC4-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-VF4IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC4-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC4:       vector.body:
-; CHECK-VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-VF4IC4-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-VF4IC4-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
-; CHECK-VF4IC4-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-VF4IC4-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 12
-; CHECK-VF4IC4-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-VF4IC4-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-VF4IC4-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[TMP9]]
-; CHECK-VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[TMP14]]
-; CHECK-VF4IC4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[TMP19]]
-; CHECK-VF4IC4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-VF4IC4-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP25]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; CHECK-VF4IC4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP27]]
-; CHECK-VF4IC4-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP29]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], 8
-; CHECK-VF4IC4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP31]]
-; CHECK-VF4IC4-NEXT:    [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP33]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP35:%.*]] = mul i32 [[TMP34]], 12
-; CHECK-VF4IC4-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP35]]
-; CHECK-VF4IC4-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP37]], align 4
-; CHECK-VF4IC4-NEXT:    [[TMP38:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP39:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD4]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP40:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP41:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD6]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP42:%.*]] = getelementptr i32, i32* [[SRC2:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC4-NEXT:    [[TMP43:%.*]] = getelementptr i32, i32* [[SRC2]], i64 [[TMP9]]
-; CHECK-VF4IC4-NEXT:    [[TMP44:%.*]] = getelementptr i32, i32* [[SRC2]], i64 [[TMP14]]
-; CHECK-VF4IC4-NEXT:    [[TMP45:%.*]] = getelementptr i32, i32* [[SRC2]], i64 [[TMP19]]
-; CHECK-VF4IC4-NEXT:    [[TMP46:%.*]] = getelementptr i32, i32* [[TMP42]], i32 0
-; CHECK-VF4IC4-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP47]], i32 4, <vscale x 4 x i1> [[TMP38]], <vscale x 4 x i32> poison)
-; CHECK-VF4IC4-NEXT:    [[TMP48:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP49:%.*]] = mul i32 [[TMP48]], 4
-; CHECK-VF4IC4-NEXT:    [[TMP50:%.*]] = getelementptr i32, i32* [[TMP42]], i32 [[TMP49]]
-; CHECK-VF4IC4-NEXT:    [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_MASKED_LOAD7:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP51]], i32 4, <vscale x 4 x i1> [[TMP39]], <vscale x 4 x i32> poison)
-; CHECK-VF4IC4-NEXT:    [[TMP52:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP53:%.*]] = mul i32 [[TMP52]], 8
-; CHECK-VF4IC4-NEXT:    [[TMP54:%.*]] = getelementptr i32, i32* [[TMP42]], i32 [[TMP53]]
-; CHECK-VF4IC4-NEXT:    [[TMP55:%.*]] = bitcast i32* [[TMP54]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP55]], i32 4, <vscale x 4 x i1> [[TMP40]], <vscale x 4 x i32> poison)
-; CHECK-VF4IC4-NEXT:    [[TMP56:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT:    [[TMP57:%.*]] = mul i32 [[TMP56]], 12
-; CHECK-VF4IC4-NEXT:    [[TMP58:%.*]] = getelementptr i32, i32* [[TMP42]], i32 [[TMP57]]
-; CHECK-VF4IC4-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP59]], i32 4, <vscale x 4 x i1> [[TMP41]], <vscale x 4 x i32> poison)
-; CHECK-VF4IC4-NEXT:    [[TMP60:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP61:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP62:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD8]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP63:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP64:%.*]] = select <vscale x 4 x i1> [[TMP60]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-VF4IC4-NEXT:    [[TMP65:%.*]] = select <vscale x 4 x i1> [[TMP61]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI1]]
-; CHECK-VF4IC4-NEXT:    [[TMP66:%.*]] = select <vscale x 4 x i1> [[TMP62]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI2]]
-; CHECK-VF4IC4-NEXT:    [[TMP67:%.*]] = select <vscale x 4 x i1> [[TMP63]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI3]]
-; CHECK-VF4IC4-NEXT:    [[TMP68:%.*]] = xor <vscale x 4 x i1> [[TMP38]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP69:%.*]] = xor <vscale x 4 x i1> [[TMP39]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP70:%.*]] = xor <vscale x 4 x i1> [[TMP40]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[TMP71:%.*]] = xor <vscale x 4 x i1> [[TMP41]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[PREDPHI]] = select <vscale x 4 x i1> [[TMP38]], <vscale x 4 x i32> [[TMP64]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-VF4IC4-NEXT:    [[PREDPHI10]] = select <vscale x 4 x i1> [[TMP39]], <vscale x 4 x i32> [[TMP65]], <vscale x 4 x i32> [[VEC_PHI1]]
-; CHECK-VF4IC4-NEXT:    [[PREDPHI11]] = select <vscale x 4 x i1> [[TMP40]], <vscale x 4 x i32> [[TMP66]], <vscale x 4 x i32> [[VEC_PHI2]]
-; CHECK-VF4IC4-NEXT:    [[PREDPHI12]] = select <vscale x 4 x i1> [[TMP41]], <vscale x 4 x i32> [[TMP67]], <vscale x 4 x i32> [[VEC_PHI3]]
-; CHECK-VF4IC4-NEXT:    [[TMP72:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT:    [[TMP73:%.*]] = mul i64 [[TMP72]], 16
-; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP73]]
-; CHECK-VF4IC4-NEXT:    [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT:    br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-VF4IC4:       middle.block:
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[PREDPHI]], zeroinitializer
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP]], <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[PREDPHI10]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP13:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT]], zeroinitializer
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT14:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP13]], <vscale x 4 x i32> [[RDX_SELECT]], <vscale x 4 x i32> [[PREDPHI11]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP15:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT14]], zeroinitializer
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT16:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP15]], <vscale x 4 x i32> [[RDX_SELECT14]], <vscale x 4 x i32> [[PREDPHI12]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP17:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT16]], zeroinitializer
-; CHECK-VF4IC4-NEXT:    [[TMP75:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP17]])
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT18:%.*]] = select i1 [[TMP75]], i32 1, i32 0
-; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC4:       scalar.ph:
-; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT18]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC4:       for.body:
-; CHECK-VF4IC4-NEXT:    [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-VF4IC4-NEXT:    [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-VF4IC4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[I_013]]
-; CHECK-VF4IC4-NEXT:    [[TMP76:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-VF4IC4-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP76]], 35
-; CHECK-VF4IC4-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-VF4IC4:       if.then:
-; CHECK-VF4IC4-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[I_013]]
-; CHECK-VF4IC4-NEXT:    [[TMP77:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-VF4IC4-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[TMP77]], 2
-; CHECK-VF4IC4-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; CHECK-VF4IC4-NEXT:    br label [[FOR_INC]]
-; CHECK-VF4IC4:       for.inc:
-; CHECK-VF4IC4-NEXT:    [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
-; CHECK-VF4IC4-NEXT:    [[INC]] = add nuw nsw i64 [[I_013]], 1
-; CHECK-VF4IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-VF4IC4:       for.end.loopexit:
-; CHECK-VF4IC4-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT18]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT:    ret i32 [[R_1_LCSSA]]
-;
-
+; CHECK-VF4IC1-LABEL: @pred_select_const_i32_from_icmp
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
+; CHECK-VF4IC1:        [[MASK:%.*]] = icmp sgt <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1:        [[MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* {{%.*}}, i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> poison)
+; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL_TMP:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1:        [[VEC_SEL:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> [[VEC_SEL_TMP]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 0
+
+; CHECK-VF4IC4-LABEL: @pred_select_const_i32_from_icmp
+; CHECK-VF4IC4:      vector.body:
 entry:
   br label %for.body
 

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index edc74d0297bd2..c1cc2d7592559 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize < %s | FileCheck %s
 
 ; These tests ensure that tail-folding is enabled when the predicate.enable
@@ -11,7 +10,7 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
@@ -21,36 +20,24 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT6]], <vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label %middle.block, label %vector.body
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[GEP]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br i1 true, label %while.end.loopexit, label %scalar.ph
 ;
 entry:
   br label %while.body

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index 42d79a96b21b8..8ca671854fccb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -hints-allow-reordering=false -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -prefer-inloop-reductions < %s | FileCheck %s
 ; RUN: opt -S -hints-allow-reordering=false -loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -prefer-inloop-reductions < %s | FileCheck %s
 
@@ -11,7 +10,7 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
@@ -21,36 +20,24 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT6]], <vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label %middle.block, label %vector.body
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[GEP]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br i1 true, label %while.end.loopexit, label %scalar.ph
 ;
 entry:
   br label %while.body
@@ -72,7 +59,7 @@ define void @simple_memcpy(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memcpy(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
@@ -82,40 +69,26 @@ define void @simple_memcpy(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    br label %vector.body
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[TMP13]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %middle.block, label %vector.body, !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[GEP2]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br i1 true, label %while.end.loopexit, label %scalar.ph
 ;
 entry:
   br label %while.body
@@ -142,7 +115,7 @@ define void @copy_stride4(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[UMAX]], -1
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
@@ -162,15 +135,15 @@ define void @copy_stride4(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 4, [[TMP12]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    br label %vector.body
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT4]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 4 x i64> [[VEC_IV]], i32 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP16]], i64 [[TMP2]])
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]]
@@ -182,23 +155,9 @@ define void @copy_stride4(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP20]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP21]], label %middle.block, label %vector.body
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[GEP2]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br i1 true, label %while.end.loopexit, label %scalar.ph
 ;
 entry:
   br label %while.body
@@ -222,7 +181,7 @@ define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* no
 ; CHECK-LABEL: @simple_gather_scatter(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
@@ -234,40 +193,24 @@ define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* no
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[IND:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> undef)
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
-; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP12]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[IND:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> undef)
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %middle.block, label %vector.body
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, i32* [[IND]], i64 [[INDEX]]
-; CHECK-NEXT:    [[IND_VAL:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[SRC]], i32 [[IND_VAL]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[GEP2]], align 4
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i32, i32* [[DST]], i32 [[IND_VAL]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[GEP3]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br i1 true, label %while.end.loopexit, label %scalar.ph
 ;
 entry:
   br label %while.body
@@ -294,7 +237,7 @@ while.end.loopexit:                               ; preds = %while.body
 define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_load(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
@@ -304,38 +247,25 @@ define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n)
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    br label %vector.body
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[N]])
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP6]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 %n)
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP10]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT4]], <vscale x 4 x i32>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br i1 true, label %for.end, label %scalar.ph
 ;
 
 entry:
@@ -362,7 +292,7 @@ for.end:                                          ; preds = %for.body, %entry
 define void @cond_uniform_load(i32* noalias %dst, i32* noalias readonly %src, i32* noalias readonly %cond, i64 %n) #0 {
 ; CHECK-LABEL: @cond_uniform_load(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
@@ -372,56 +302,35 @@ define void @cond_uniform_load(i32* noalias %dst, i32* noalias readonly %src, i3
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[SRC:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[SRC:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[N]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = xor <vscale x 4 x i1> [[TMP9]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i32> undef)
-; CHECK-NEXT:    [[TMP12:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or <vscale x 4 x i1> [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32>* [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP14]])
-; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 4
-; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 %n)
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[BROADCAST_SPLAT6]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> undef)
+; CHECK-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32>* [[TMP20]], i32 4, <vscale x 4 x i1> [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 4
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP23]], label %middle.block, label %vector.body
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP20]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[TMP21]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT:    store i32 [[VAL_0]], i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br i1 true, label %for.end, label %scalar.ph
 ;
 
 entry:
@@ -456,7 +365,7 @@ for.end:                                          ; preds = %for.inc, %entry
 define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_store(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
@@ -466,38 +375,25 @@ define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[DST:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[DST:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[N]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 %n)
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32*> [[BROADCAST_SPLAT4]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label %middle.block, label %vector.body
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[DST]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br i1 true, label %for.end, label %scalar.ph
 ;
 
 entry:
@@ -521,7 +417,7 @@ define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {
 ; CHECK-LABEL: @simple_fdiv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
@@ -531,45 +427,29 @@ define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    br label %vector.body
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr float, float* [[SRC:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr float, float* [[DST:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr float, float* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-NEXT:    [[TMP12:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0nxv4f32(<vscale x 4 x float> [[TMP12]], <vscale x 4 x float>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-NEXT:    [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr float, float* [[SRC:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr float, float* [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr float, float* [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr float, float* [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <vscale x 4 x float>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-NEXT:    [[TMP16:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP14]] to <vscale x 4 x float>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0nxv4f32(<vscale x 4 x float> [[TMP16]], <vscale x 4 x float>* [[TMP17]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %middle.block, label %vector.body, !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr float, float* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr float, float* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load float, float* [[GEP1]], align 4
-; CHECK-NEXT:    [[VAL2:%.*]] = load float, float* [[GEP2]], align 4
-; CHECK-NEXT:    [[RES:%.*]] = fdiv float [[VAL1]], [[VAL2]]
-; CHECK-NEXT:    store float [[RES]], float* [[GEP2]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br i1 true, label %while.end.loopexit, label %scalar.ph
 ;
 entry:
   br label %while.body
@@ -595,7 +475,7 @@ define i32 @add_reduction_i32(i32* %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @add_reduction_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label %vector.ph
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
@@ -605,42 +485,26 @@ define i32 @add_reduction_i32(i32* %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    br label %vector.body
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP9]])
-; CHECK-NEXT:    [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, %vector.ph ], [ [[TMP14:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP12:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; CHECK-NEXT:    [[TMP14]] = add i32 [[TMP13]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[RED_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[GEP]], align 4
-; CHECK-NEXT:    [[RED_NEXT]] = add i32 [[RED]], [[VAL]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[RED_NEXT_LCSSA]]
+; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label %scalar.ph
 ;
 entry:
   br label %while.body
@@ -663,7 +527,7 @@ define float @add_reduction_f32(float* %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @add_reduction_f32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
@@ -673,41 +537,25 @@ define float @add_reduction_f32(float* %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    br label %vector.body
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP10]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP9]])
-; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
-; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP12]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[TMP14:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast float* [[TMP11]] to <vscale x 4 x float>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %middle.block, label %vector.body, !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[RED:%.*]] = phi float [ [[RED_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VAL:%.*]] = load float, float* [[GEP]], align 4
-; CHECK-NEXT:    [[RED_NEXT]] = fadd float [[RED]], [[VAL]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret float [[RED_NEXT_LCSSA]]
+; CHECK-NEXT:    br i1 true, label %while.end.loopexit, label %scalar.ph
 ;
 entry:
   br label %while.body
@@ -729,7 +577,7 @@ while.end.loopexit:                               ; preds = %while.body
 define i32 @cond_xor_reduction(i32* noalias %a, i32* noalias %cond, i64 %N) #0 {
 ; CHECK-LABEL: @cond_xor_reduction(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
@@ -739,10 +587,10 @@ define i32 @cond_xor_reduction(i32* noalias %a, i32* noalias %cond, i64 %N) #0 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    br label %vector.body
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 7, %vector.ph ], [ [[TMP16:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[N]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP5]]
@@ -764,32 +612,7 @@ define i32 @cond_xor_reduction(i32* noalias %a, i32* noalias %cond, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 7, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP20]], 5
-; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[RDX]], [[TMP21]]
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[RES]] = phi i32 [ [[RDX]], [[FOR_BODY]] ], [ [[XOR]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[RES_LCSSA]]
-;
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label %scalar.ph
 entry:
   br label %for.body
 
@@ -823,21 +646,7 @@ for.end:
 ; divides for scalable vectors we just don't bother vectorizing.
 define void @simple_idiv(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-LABEL: @simple_idiv(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT:    [[VAL2:%.*]] = load i32, i32* [[GEP2]], align 4
-; CHECK-NEXT:    [[RES:%.*]] = udiv i32 [[VAL1]], [[VAL2]]
-; CHECK-NEXT:    store i32 [[RES]], i32* [[GEP2]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    ret void
+; CHECK-NOT:   vector.body
 ;
 entry:
   br label %while.body

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-type-conv.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-type-conv.ll
index 857ae98df88a6..221a6a67bb041 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-type-conv.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-type-conv.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -dce -instcombine < %s -S | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -6,50 +5,8 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define void @f16_to_f32(float* noalias nocapture %dst, half* noalias nocapture readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @f16_to_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds half, half* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast half* [[TMP4]] to <vscale x 8 x half>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, <vscale x 8 x half>* [[TMP5]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = fpext <vscale x 8 x half> [[WIDE_LOAD]] to <vscale x 8 x float>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 8 x float>*
-; CHECK-NEXT:    store <vscale x 8 x float> [[TMP6]], <vscale x 8 x float>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds half, half* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load half, half* [[ARRAYIDX]], align 2
-; CHECK-NEXT:    [[CONV:%.*]] = fpext half [[TMP12]] to float
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[DST]], i64 [[I_07]]
-; CHECK-NEXT:    store float [[CONV]], float* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: vector.body
+; CHECK:   %{{.*}} = fpext <vscale x 8 x half> %{{.*}} to <vscale x 8 x float>
 entry:
   br label %for.body
 
@@ -71,50 +28,8 @@ for.end:                                          ; preds = %for.body, %entry
 
 define void @f64_to_f32(float* noalias nocapture %dst, double* noalias nocapture readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @f64_to_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <vscale x 8 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x double>, <vscale x 8 x double>* [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = fptrunc <vscale x 8 x double> [[WIDE_LOAD]] to <vscale x 8 x float>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 8 x float>*
-; CHECK-NEXT:    store <vscale x 8 x float> [[TMP6]], <vscale x 8 x float>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[CONV:%.*]] = fptrunc double [[TMP12]] to float
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[DST]], i64 [[I_07]]
-; CHECK-NEXT:    store float [[CONV]], float* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: vector.body
+; CHECK:   %{{.*}} = fptrunc <vscale x 8 x double> %{{.*}} to <vscale x 8 x float>
 entry:
   br label %for.body
 
@@ -136,50 +51,8 @@ for.end:                                          ; preds = %for.body, %entry
 
 define void @f16_to_s8(i8* noalias nocapture %dst, half* noalias nocapture readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @f16_to_s8(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds half, half* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast half* [[TMP4]] to <vscale x 8 x half>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, <vscale x 8 x half>* [[TMP5]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = fptosi <vscale x 8 x half> [[WIDE_LOAD]] to <vscale x 8 x i8>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <vscale x 8 x i8>*
-; CHECK-NEXT:    store <vscale x 8 x i8> [[TMP6]], <vscale x 8 x i8>* [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds half, half* [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load half, half* [[ARRAYIDX]], align 2
-; CHECK-NEXT:    [[CONV1:%.*]] = fptosi half [[TMP12]] to i8
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    store i8 [[CONV1]], i8* [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: vector.body
+; CHECK:   %{{.*}} = fptosi <vscale x 8 x half> %{{.*}} to <vscale x 8 x i8>
 entry:
   br label %for.body
 
@@ -201,50 +74,8 @@ for.end:                                          ; preds = %for.body, %entry
 
 define void @f32_to_u64(i64* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @f32_to_u64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <vscale x 8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = fptoui <vscale x 8 x float> [[WIDE_LOAD]] to <vscale x 8 x i64>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64* [[TMP7]] to <vscale x 8 x i64>*
-; CHECK-NEXT:    store <vscale x 8 x i64> [[TMP6]], <vscale x 8 x i64>* [[TMP8]], align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CONV:%.*]] = fptoui float [[TMP12]] to i64
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[DST]], i64 [[I_07]]
-; CHECK-NEXT:    store i64 [[CONV]], i64* [[ARRAYIDX1]], align 8
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: vector.body
+; CHECK:   %{{.*}} = fptoui <vscale x 8 x float> %{{.*}} to <vscale x 8 x i64>
 entry:
   br label %for.body
 
@@ -266,50 +97,8 @@ for.end:                                          ; preds = %for.body, %entry
 
 define void @s8_to_f32(float* noalias nocapture %dst, i8* noalias nocapture readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @s8_to_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <vscale x 8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = sitofp <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x float>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 8 x float>*
-; CHECK-NEXT:    store <vscale x 8 x float> [[TMP6]], <vscale x 8 x float>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i8 [[TMP12]] to float
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[DST]], i64 [[I_07]]
-; CHECK-NEXT:    store float [[CONV]], float* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: vector.body
+; CHECK:   %{{.*}} = sitofp <vscale x 8 x i8> %{{.*}} to <vscale x 8 x float>
 entry:
   br label %for.body
 
@@ -331,50 +120,8 @@ for.end:                                          ; preds = %for.body, %entry
 
 define void @u16_to_f32(float* noalias nocapture %dst, i16* noalias nocapture readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @u16_to_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <vscale x 8 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, <vscale x 8 x i16>* [[TMP5]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = uitofp <vscale x 8 x i16> [[WIDE_LOAD]] to <vscale x 8 x float>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 8 x float>*
-; CHECK-NEXT:    store <vscale x 8 x float> [[TMP6]], <vscale x 8 x float>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; CHECK-NEXT:    [[CONV:%.*]] = uitofp i16 [[TMP12]] to float
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[DST]], i64 [[I_07]]
-; CHECK-NEXT:    store float [[CONV]], float* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: vector.body
+; CHECK:   %{{.*}} = uitofp <vscale x 8 x i16> %{{.*}} to <vscale x 8 x float>
 entry:
   br label %for.body
 
@@ -396,50 +143,8 @@ for.end:                                          ; preds = %for.body, %entry
 
 define void @u64_to_f16(half* noalias nocapture %dst, i64* noalias nocapture readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @u64_to_f16(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <vscale x 8 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, <vscale x 8 x i64>* [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = uitofp <vscale x 8 x i64> [[WIDE_LOAD]] to <vscale x 8 x half>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds half, half* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast half* [[TMP7]] to <vscale x 8 x half>*
-; CHECK-NEXT:    store <vscale x 8 x half> [[TMP6]], <vscale x 8 x half>* [[TMP8]], align 2
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[CONV1:%.*]] = uitofp i64 [[TMP12]] to half
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds half, half* [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    store half [[CONV1]], half* [[ARRAYIDX2]], align 2
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK:      vector.body
+; CHECK:        %{{.*}} = uitofp <vscale x 8 x i64> %{{.*}} to <vscale x 8 x half>
 entry:
   br label %for.body
 
@@ -461,50 +166,8 @@ for.end:                                          ; preds = %for.body, %entry
 
 define void @s64_to_f16(half* noalias nocapture %dst, i64* noalias nocapture readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @s64_to_f16(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <vscale x 8 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, <vscale x 8 x i64>* [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = sitofp <vscale x 8 x i64> [[WIDE_LOAD]] to <vscale x 8 x half>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds half, half* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast half* [[TMP7]] to <vscale x 8 x half>*
-; CHECK-NEXT:    store <vscale x 8 x half> [[TMP6]], <vscale x 8 x half>* [[TMP8]], align 2
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[CONV1:%.*]] = sitofp i64 [[TMP12]] to half
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds half, half* [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    store half [[CONV1]], half* [[ARRAYIDX2]], align 2
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK:      vector.body
+; CHECK:        %{{.*}} = sitofp <vscale x 8 x i64> %{{.*}} to <vscale x 8 x half>
 entry:
   br label %for.body
 
@@ -526,50 +189,8 @@ for.end:                                          ; preds = %for.body, %entry
 
 define void @s8_to_s32(i32* noalias nocapture %dst, i8* noalias nocapture readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @s8_to_s32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <vscale x 8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    store <vscale x 8 x i32> [[TMP6]], <vscale x 8 x i32>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP12]] to i32
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[I_07]]
-; CHECK-NEXT:    store i32 [[CONV]], i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: vector.body
+; CHECK:   %{{.*}} = sext <vscale x 8 x i8> %{{.*}} to <vscale x 8 x i32>
 entry:
   br label %for.body
 
@@ -591,50 +212,8 @@ for.end:                                          ; preds = %for.body, %entry
 
 define void @u8_to_u16(i16* noalias nocapture %dst, i8* noalias nocapture readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @u8_to_u16(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <vscale x 8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <vscale x 8 x i16>*
-; CHECK-NEXT:    store <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16>* [[TMP8]], align 2
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP12]] to i16
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[DST]], i64 [[I_07]]
-; CHECK-NEXT:    store i16 [[CONV]], i16* [[ARRAYIDX1]], align 2
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: vector.body
+; CHECK:   %{{.*}} = zext <vscale x 8 x i8> %{{.*}} to <vscale x 8 x i16>
 entry:
   br label %for.body
 
@@ -656,50 +235,8 @@ for.end:                                          ; preds = %for.body, %entry
 
 define void @s64_to_s8(i8* noalias nocapture %dst, i64* noalias nocapture readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @s64_to_s8(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <vscale x 8 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, <vscale x 8 x i64>* [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc <vscale x 8 x i64> [[WIDE_LOAD]] to <vscale x 8 x i8>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <vscale x 8 x i8>*
-; CHECK-NEXT:    store <vscale x 8 x i8> [[TMP6]], <vscale x 8 x i8>* [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP12]] to i8
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[I_07]]
-; CHECK-NEXT:    store i8 [[CONV]], i8* [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: vector.body
+; CHECK:   %{{.*}} = trunc <vscale x 8 x i64> %{{.*}} to <vscale x 8 x i8>
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
index fb092315b5252..660eda6dc567b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; This is the loop in c++ being vectorize in this file with
 ; experimental.vector.reverse
 
@@ -17,92 +16,12 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 
 define void @vector_reverse_mask_nxv4i1(double* %a, double* %cond, i64 %N) #0 {
-; CHECK-LABEL: @vector_reverse_mask_nxv4i1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[N]]
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr double, double* [[COND:%.*]], i64 [[N]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[COND]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[INDEX]], -1
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], [[N]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[COND]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[DOTNEG:%.*]] = mul i32 [[TMP7]], -4
-; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[DOTNEG]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[TMP10]] to <vscale x 4 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, <vscale x 4 x double>* [[TMP11]], align 8, !alias.scope !0
-; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP12:%.*]] = fcmp une <vscale x 4 x double> [[REVERSE]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[DOTNEG10:%.*]] = mul i32 [[TMP14]], -4
-; CHECK-NEXT:    [[TMP15:%.*]] = or i32 [[DOTNEG10]], 1
-; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr double, double* [[TMP13]], i64 [[TMP16]]
-; CHECK-NEXT:    [[REVERSE6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP12]])
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast double* [[TMP17]] to <vscale x 4 x double>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* [[TMP18]], i32 8, <vscale x 4 x i1> [[REVERSE6]], <vscale x 4 x double> poison), !alias.scope !3, !noalias !0
-; CHECK-NEXT:    [[TMP19:%.*]] = fadd <vscale x 4 x double> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[DOTNEG11:%.*]] = mul i32 [[TMP20]], -4
-; CHECK-NEXT:    [[TMP21:%.*]] = or i32 [[DOTNEG11]], 1
-; CHECK-NEXT:    [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr double, double* [[TMP13]], i64 [[TMP22]]
-; CHECK-NEXT:    [[REVERSE9:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP12]])
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast double* [[TMP23]] to <vscale x 4 x double>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4f64.p0nxv4f64(<vscale x 4 x double> [[TMP19]], <vscale x 4 x double>* [[TMP24]], i32 8, <vscale x 4 x i1> [[REVERSE9]]), !alias.scope !3, !noalias !0
-; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP26:%.*]] = shl i64 [[TMP25]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]]
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_MOD_VF]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08_IN:%.*]] = phi i64 [ [[I_08:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[I_08]] = add nsw i64 [[I_08_IN]], -1
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[COND]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP28:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une double [[TMP28]], 0.000000e+00
-; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP29:%.*]] = load double, double* [[ARRAYIDX1]], align 8
-; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP29]], 1.000000e+00
-; CHECK-NEXT:    store double [[ADD]], double* [[ARRAYIDX1]], align 8
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]]
-;
+; CHECK-LABEL: vector.body:
+; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
+; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[WIDEMSKLOAD]]
+; CHECK:  %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: call void @llvm.masked.store.nxv4f64.p0nxv4f64(<vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE9]]
 
 entry:
   %cmp7 = icmp sgt i64 %N, 0

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
index 0c626241301fe..ebe13c0c83302 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
@@ -1,59 +1,17 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
 define void @widen_extractvalue(i64* %dst, {i64, i64} %sv) #0 {
 ; CHECK-LABEL: @widen_extractvalue(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 0, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 0, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 0, [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { i64, i64 } [[SV:%.*]], 0
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP5]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i64, i64 } [[SV]], 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, i64* [[DST:%.*]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i64, i64* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[TMP9]] to <vscale x 2 x i64>*
-; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i64>* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP12]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 0, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_BODY:%.*]]
-; CHECK:       loop.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_BODY]] ]
-; CHECK-NEXT:    [[A:%.*]] = extractvalue { i64, i64 } [[SV]], 0
-; CHECK-NEXT:    [[B:%.*]] = extractvalue { i64, i64 } [[SV]], 1
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr i64, i64* [[DST]], i32 [[IV]]
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[A]], [[B]]
-; CHECK-NEXT:    store i64 [[ADD]], i64* [[ADDR]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp ne i32 [[IV_NEXT]], 0
-; CHECK-NEXT:    br i1 [[COND]], label [[LOOP_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK: vector.body:
+; CHECK:        [[EXTRACT0:%.*]] = extractvalue { i64, i64 } [[SV:%.*]], 0
+; CHECK-NEXT:   [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXTRACT0]], i32 0
+; CHECK-NEXT:   [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:   [[EXTRACT1:%.*]] = extractvalue { i64, i64 } [[SV]], 1
+; CHECK-NEXT:   [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXTRACT1]], i32 0
+; CHECK-NEXT:   [[DOTSPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK:        [[ADD:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[DOTSPLAT2]]
 entry:
   br label %loop.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
index bb6765cb47f4e..bf0aba1931d12 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -scalable-vectorization=off -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s
 
 ; NOTE: These tests aren't really target-specific, but it's convenient to target AArch64
@@ -10,43 +9,21 @@ target triple = "aarch64-linux-gnu"
 ; we don't artificially create new predicated blocks for the load.
 define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_load(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NEXT:    [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[IDX]], 0
+; CHECK-NEXT:    [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n)
+; CHECK-NEXT:    [[LOAD_VAL:%.*]] = load i32, i32* %src, align 4
+; CHECK-NOT:     load i32, i32* %src, align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[LOAD_VAL]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* %dst, i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT:    [[STORE_PTR:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32>* [[STORE_PTR]], i32 4, <4 x i1> [[LOOP_PRED]])
+; CHECK-NEXT:    [[IDX_NEXT]] = add i64 [[IDX]], 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IDX_NEXT]], %n.vec
+; CHECK-NEXT:    br i1 [[CMP]], label %middle.block, label %vector.body
 
 entry:
   br label %for.body
@@ -70,79 +47,18 @@ for.end:                                          ; preds = %for.body, %entry
 ; and the original condition.
 define void @cond_uniform_load(i32* nocapture %dst, i32* nocapture readonly %src, i32* nocapture readonly %cond, i64 %n) #0 {
 ; CHECK-LABEL: @cond_uniform_load(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[DST1:%.*]] = bitcast i32* [[DST:%.*]] to i8*
-; CHECK-NEXT:    [[COND3:%.*]] = bitcast i32* [[COND:%.*]] to i8*
-; CHECK-NEXT:    [[SRC6:%.*]] = bitcast i32* [[SRC:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[DST]], i64 [[N:%.*]]
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[COND]], i64 [[N]]
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[SRC]], i64 1
-; CHECK-NEXT:    [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[DST1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[COND3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[DST1]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[SRC6]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
-; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32*> poison, i32* [[SRC]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:         [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* %src, i32 0
+; CHECK-NEXT:    [[SRC_SPLAT:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX12]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison), !alias.scope !4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[IDX]], 0
+; CHECK-NEXT:    [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n)
+; CHECK:         [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{%.*}}, i32 4, <4 x i1> [[LOOP_PRED]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[COND_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT]], i32 4, <4 x i1> [[TMP6]], <4 x i32> undef), !alias.scope !7
-; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i1> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[PREDPHI]], <4 x i32>* [[TMP11]], i32 4, <4 x i1> [[TMP9]]), !alias.scope !9, !noalias !11
-; CHECK-NEXT:    [[INDEX_NEXT13]] = add i64 [[INDEX12]], 4
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP13]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[TMP14]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT:    store i32 [[VAL_0]], i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NEXT:    [[MASK:%.*]] = select <4 x i1> [[LOOP_PRED]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[SRC_SPLAT]], i32 4, <4 x i1> [[MASK]], <4 x i32> undef)
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
index 66f586268a0b1..14367c462030e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -vector-library=Darwin_libsystem_m -inject-tli-mappings -loop-vectorize -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -7,57 +6,8 @@ target triple = "arm64-apple-darwin"
 declare float @expf(float) nounwind readnone
 define void @expf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @expf_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_exp_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_exp_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @expf(float [[LV]]) #[[ATTR1:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_exp_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -80,57 +30,8 @@ for.end:
 declare double @exp(double) nounwind readnone
 define void @exp_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @exp_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_exp_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_exp_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @exp(double [[LV]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_exp_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -153,57 +54,8 @@ for.end:
 declare float @acosf(float) nounwind readnone
 define void @acos_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @acos_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_acos_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_acos_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @acosf(float [[LV]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_acos_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -226,57 +78,8 @@ for.end:
 declare double @acos(double) nounwind readnone
 define void @acos_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @acos_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_acos_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_acos_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @acos(double [[LV]]) #[[ATTR4:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_acos_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -299,57 +102,8 @@ for.end:
 declare float @asinf(float) nounwind readnone
 define void @asinf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @asinf_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_asin_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_asin_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @asinf(float [[LV]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_asin_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -372,57 +126,8 @@ for.end:
 declare double @asin(double) nounwind readnone
 define void @asin_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @asin_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_asin_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_asin_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @asin(double [[LV]]) #[[ATTR6:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_asin_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -442,60 +147,11 @@ for.end:
   ret void
 }
 
-  declare float @atanf(float) nounwind readnone
+ declare float @atanf(float) nounwind readnone
 define void @atanf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @atanf_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_atan_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_atan_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @atanf(float [[LV]]) #[[ATTR7:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_atan_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -518,57 +174,8 @@ for.end:
 declare double @atan(double) nounwind readnone
 define void @atan_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @atan_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_atan_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_atan_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @atan(double [[LV]]) #[[ATTR8:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_atan_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -591,57 +198,8 @@ for.end:
 declare float @atan2f(float) nounwind readnone
 define void @atan2f_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @atan2f_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_atan2_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_atan2_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @atan2f(float [[LV]]) #[[ATTR9:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_atan2_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -664,57 +222,8 @@ for.end:
 declare double @atan2(double) nounwind readnone
 define void @atan2_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @atan2_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_atan2_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_atan2_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @atan2(double [[LV]]) #[[ATTR10:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_atan2_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -737,57 +246,8 @@ for.end:
 declare float @cosf(float) nounwind readnone
 define void @cosf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @cosf_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_cos_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_cos_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @cosf(float [[LV]]) #[[ATTR11:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_cos_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -810,57 +270,8 @@ for.end:
 declare double @cos(double) nounwind readnone
 define void @cos_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @cos_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_cos_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_cos_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @cos(double [[LV]]) #[[ATTR12:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_cos_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -883,57 +294,8 @@ for.end:
 declare float @cbrtf(float) nounwind readnone
 define void @cbrtf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @cbrtf_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_cbrt_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_cbrt_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @cbrtf(float [[LV]]) #[[ATTR13:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_cbrt_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -956,57 +318,8 @@ for.end:
 declare double @cbrt(double) nounwind readnone
 define void @cbrt_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @cbrt_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_cbrt_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_cbrt_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @cbrt(double [[LV]]) #[[ATTR14:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_cbrt_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1029,57 +342,8 @@ for.end:
 declare float @erff(float) nounwind readnone
 define void @erff_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @erff_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_erf_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_erf_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @erff(float [[LV]]) #[[ATTR15:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_erf_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1102,57 +366,8 @@ for.end:
 declare double @erf(double) nounwind readnone
 define void @erf_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @erf_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_erf_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_erf_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @erf(double [[LV]]) #[[ATTR16:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_erf_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1175,57 +390,8 @@ for.end:
 declare float @powf(float) nounwind readnone
 define void @powf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @powf_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_pow_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_pow_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @powf(float [[LV]]) #[[ATTR17:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_pow_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1248,57 +414,8 @@ for.end:
 declare double @pow(double) nounwind readnone
 define void @pow_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @pow_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_pow_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_pow_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @pow(double [[LV]]) #[[ATTR18:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_pow_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1321,57 +438,8 @@ for.end:
 declare float @sinhf(float) nounwind readnone
 define void @sinhf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @sinhf_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_sinh_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_sinh_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @sinhf(float [[LV]]) #[[ATTR19:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_sinh_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1394,57 +462,8 @@ for.end:
 declare double @sinh(double) nounwind readnone
 define void @sinh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @sinh_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_sinh_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_sinh_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @sinh(double [[LV]]) #[[ATTR20:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_sinh_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1467,57 +486,8 @@ for.end:
 declare float @coshf(float) nounwind readnone
 define void @coshf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @coshf_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_cosh_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_cosh_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @coshf(float [[LV]]) #[[ATTR21:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_cosh_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1540,57 +510,8 @@ for.end:
 declare double @cosh(double) nounwind readnone
 define void @cosh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @cosh_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_cosh_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_cosh_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @cosh(double [[LV]]) #[[ATTR22:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_cosh_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1613,57 +534,8 @@ for.end:
 declare float @tanhf(float) nounwind readnone
 define void @tanhf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @tanhf_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_tanh_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_tanh_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @tanhf(float [[LV]]) #[[ATTR23:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_tanh_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1686,57 +558,8 @@ for.end:
 declare double @tanh(double) nounwind readnone
 define void @tanh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @tanh_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_tanh_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_tanh_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @tanh(double [[LV]]) #[[ATTR24:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_tanh_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1759,57 +582,8 @@ for.end:
 declare float @asinhf(float) nounwind readnone
 define void @asinhf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @asinhf_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_asinh_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_asinh_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @asinhf(float [[LV]]) #[[ATTR25:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_asinh_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1832,57 +606,8 @@ for.end:
 declare double @asinh(double) nounwind readnone
 define void @asinh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @asinh_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_asinh_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_asinh_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @asinh(double [[LV]]) #[[ATTR26:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP53:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_asinh_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1905,57 +630,8 @@ for.end:
 declare float @acoshf(float) nounwind readnone
 define void @acoshf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @acoshf_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_acosh_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_acosh_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @acoshf(float [[LV]]) #[[ATTR27:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP55:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_acosh_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -1978,57 +654,8 @@ for.end:
 declare double @acosh(double) nounwind readnone
 define void @acosh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @acosh_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_acosh_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_acosh_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP56:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @acosh(double [[LV]]) #[[ATTR28:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP57:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_acosh_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -2051,57 +678,8 @@ for.end:
 declare float @atanhf(float) nounwind readnone
 define void @atanhf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
 ; CHECK-LABEL: @atanhf_v4f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @_simd_atanh_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @_simd_atanh_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @atanhf(float [[LV]]) #[[ATTR29:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP59:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <4 x float> @_simd_atanh_f4(
+; CHECK: ret void
 
 entry:
   br label %for.body
@@ -2124,57 +702,8 @@ for.end:
 declare double @atanh(double) nounwind readnone
 define void @atanh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
 ; CHECK-LABEL: @atanh_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @_simd_atanh_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @_simd_atanh_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP60:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @atanh(double [[LV]]) #[[ATTR30:[0-9]+]]
-; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP61:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: call <2 x double> @_simd_atanh_d2(
+; CHECK: ret void
 
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
index 23ede3284690d..311dfdb068ca5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test VLA for reverse with fixed size vector
 ; This is the loop in c++ being vectorize in this file with
 ; shuffle reverse
@@ -9,71 +8,20 @@
 ; RUN: opt -loop-vectorize -dce  -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
 
 define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0 {
-; CHECK-LABEL: @vector_reverse_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
-; CHECK-NEXT:    [[B3:%.*]] = bitcast double* [[B:%.*]] to i8*
-; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 [[N]]
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr double, double* [[B]], i64 [[N]]
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[TMP0]], -1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 -7
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <8 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x double>, <8 x double>* [[TMP5]], align 8, !alias.scope !0
-; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x double> [[WIDE_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <8 x double> [[REVERSE]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[REVERSE6:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, double* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[TMP8]], i32 -7
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast double* [[TMP9]] to <8 x double>*
-; CHECK-NEXT:    store <8 x double> [[REVERSE6]], <8 x double>* [[TMP10]], align 8, !alias.scope !3, !noalias !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08_IN:%.*]] = phi i64 [ [[I_08:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[I_08]] = add nsw i64 [[I_08_IN]], -1
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP12]], 1.000000e+00
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]]
-; CHECK-NEXT:    store double [[ADD]], double* [[ARRAYIDX1]], align 8
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]]
-;
+; CHECK-LABEL: vector_reverse_f64
+; CHECK-LABEL: vector.body
+; CHECK: %[[GEP:.*]] = getelementptr inbounds double, double* %{{.*}}, i32 0
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds double, double* %[[GEP]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <8 x double>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <8 x double>, <8 x double>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x double> %[[WIDE]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[FADD:.*]] = fadd <8 x double> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, double* {{.*}}, i64 {{.*}}
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x double> %[[FADD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, double* %[[GEP2]], i32 0
+; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds double, double* %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP4]] to <8 x double>*
+; CHECK-NEXT:  store <8 x double> %[[REVERSE6]], <8 x double>* %[[CAST]], align 8
 
 entry:
   %cmp7 = icmp sgt i64 %N, 0
@@ -95,71 +43,20 @@ for.body:                                         ; preds = %entry, %for.body
 }
 
 define void @vector_reverse_i64(i64 %N, i64* %a, i64* %b) #0 {
-; CHECK-LABEL: @vector_reverse_i64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A1:%.*]] = bitcast i64* [[A:%.*]] to i8*
-; CHECK-NEXT:    [[B3:%.*]] = bitcast i64* [[B:%.*]] to i8*
-; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i64, i64* [[A]], i64 [[N]]
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i64* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i64, i64* [[B]], i64 [[N]]
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i64* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[TMP0]], -1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[TMP3]], i32 -7
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <8 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, <8 x i64>* [[TMP5]], align 8, !alias.scope !9
-; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <8 x i64> [[REVERSE]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[REVERSE6:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[TMP8]], i32 -7
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[TMP9]] to <8 x i64>*
-; CHECK-NEXT:    store <8 x i64> [[REVERSE6]], <8 x i64>* [[TMP10]], align 8, !alias.scope !12, !noalias !9
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_09_IN:%.*]] = phi i64 [ [[I_09:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[I_09]] = add nsw i64 [[I_09_IN]], -1
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[I_09]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[TMP12]], 1
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_09]]
-; CHECK-NEXT:    store i64 [[ADD]], i64* [[ARRAYIDX2]], align 8
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[I_09_IN]], 1
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]]
-;
+; CHECK-LABEL: vector_reverse_i64
+; CHECK-LABEL: vector.body
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, i64* %{{.*}}, i32 0
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds i64, i64* %[[GEP]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <8 x i64>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <8 x i64>, <8 x i64>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x i64> %[[WIDE]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[FADD:.*]] = add <8 x i64> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, i64* {{.*}}, i64 {{.*}}
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x i64> %[[FADD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, i64* %[[GEP2]], i32 0
+; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds i64, i64* %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast i64* %[[GEP4]] to <8 x i64>*
+; CHECK-NEXT:  store <8 x i64> %[[REVERSE6]], <8 x i64>* %[[CAST1]], align 8
 
 entry:
   %cmp8 = icmp sgt i64 %N, 0

diff  --git a/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
index 96a6f257ef041..6d1fa6f36a9dc 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
@@ -20,70 +19,6 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
 ; CHECK-NOT: <4 x float>
 
 define void @_Z4testmm(i32 %size, i32 %offset) {
-; CHECK-LABEL: @_Z4testmm(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP53:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP53]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]]
-; CHECK:       for.body.lr.ph:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[R_057:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[G_056:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD20:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[V_055:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[B_054:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD30:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[V_055]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[ADD]], 3
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[MUL]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i32 0, i32 [[V_055]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast float [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i32 0, i32 [[V_055]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP2]]
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i32 0, i32 [[V_055]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP3]]
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i32 0, i32 [[V_055]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX8]], align 4
-; CHECK-NEXT:    [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP4]]
-; CHECK-NEXT:    [[ADD10]] = fadd fast float [[R_057]], [[MUL9]]
-; CHECK-NEXT:    [[ARRAYIDX_SUM:%.*]] = add i32 [[MUL]], 1
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[ARRAYIDX_SUM]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX11]], align 4
-; CHECK-NEXT:    [[MUL13:%.*]] = fmul fast float [[TMP1]], [[TMP5]]
-; CHECK-NEXT:    [[MUL15:%.*]] = fmul fast float [[TMP2]], [[MUL13]]
-; CHECK-NEXT:    [[MUL17:%.*]] = fmul fast float [[TMP3]], [[MUL15]]
-; CHECK-NEXT:    [[MUL19:%.*]] = fmul fast float [[TMP4]], [[MUL17]]
-; CHECK-NEXT:    [[ADD20]] = fadd fast float [[G_056]], [[MUL19]]
-; CHECK-NEXT:    [[ARRAYIDX_SUM52:%.*]] = add i32 [[MUL]], 2
-; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[ARRAYIDX_SUM52]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX21]], align 4
-; CHECK-NEXT:    [[MUL23:%.*]] = fmul fast float [[TMP1]], [[TMP6]]
-; CHECK-NEXT:    [[MUL25:%.*]] = fmul fast float [[TMP2]], [[MUL23]]
-; CHECK-NEXT:    [[MUL27:%.*]] = fmul fast float [[TMP3]], [[MUL25]]
-; CHECK-NEXT:    [[MUL29:%.*]] = fmul fast float [[TMP4]], [[MUL27]]
-; CHECK-NEXT:    [[ADD30]] = fadd fast float [[B_054]], [[MUL29]]
-; CHECK-NEXT:    [[INC]] = add i32 [[V_055]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], [[SIZE]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]]
-; CHECK:       for.cond.for.end_crit_edge:
-; CHECK-NEXT:    [[ADD30_LCSSA:%.*]] = phi float [ [[ADD30]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ADD20_LCSSA:%.*]] = phi float [ [[ADD20]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ADD10_LCSSA:%.*]] = phi float [ [[ADD10]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[PHITMP:%.*]] = fptoui float [[ADD10_LCSSA]] to i8
-; CHECK-NEXT:    [[PHITMP60:%.*]] = fptoui float [[ADD20_LCSSA]] to i8
-; CHECK-NEXT:    [[PHITMP61:%.*]] = fptoui float [[ADD30_LCSSA]] to i8
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i8 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[G_0_LCSSA:%.*]] = phi i8 [ [[PHITMP60]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[B_0_LCSSA:%.*]] = phi i8 [ [[PHITMP61]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    store i8 [[R_0_LCSSA]], i8* @r_, align 4
-; CHECK-NEXT:    store i8 [[G_0_LCSSA]], i8* @g_, align 4
-; CHECK-NEXT:    store i8 [[B_0_LCSSA]], i8* @b_, align 4
-; CHECK-NEXT:    ret void
-;
 entry:
   %cmp53 = icmp eq i32 %size, 0
   br i1 %cmp53, label %for.end, label %for.body.lr.ph

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
index a8e53448034be..bb9ba563da6b9 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
@@ -217,19 +217,19 @@ define void @test_stride3_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP2]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP5]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -240,8 +240,8 @@ define void @test_stride3_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 3
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
 ; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
@@ -282,19 +282,19 @@ define void @test_stride4_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP2]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP5]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -305,8 +305,8 @@ define void @test_stride4_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 4
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
 ; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
@@ -347,22 +347,22 @@ define void @test_stride_loopinvar_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i32 [[TMP0]], [[STRIDE]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32>* [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP1]], i32 [[N]])
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw nsw i32 [[TMP1]], [[STRIDE]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -373,8 +373,8 @@ define void @test_stride_loopinvar_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
 ; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
@@ -417,20 +417,20 @@ define void @test_stride_noninvar_4i32(i32* readonly %data, i32* noalias nocaptu
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 3, i32 11, i32 19, i32 27>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw <4 x i32> [[TMP2]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP3]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw <4 x i32> [[TMP5]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP6]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 32, i32 32, i32 32, i32 32>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -444,8 +444,8 @@ define void @test_stride_noninvar_4i32(i32* readonly %data, i32* noalias nocaptu
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP13]]
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
 ; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
@@ -539,20 +539,20 @@ define void @test_stride_noninvar3_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND4:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP5]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw <4 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[VEC_IND_NEXT5]] = add <4 x i32> [[VEC_IND4]], [[DOTSPLAT3]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -566,8 +566,8 @@ define void @test_stride_noninvar3_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP15]]
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
 ; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
index 604cd8ab880d0..28dba6af29c67 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
@@ -387,4 +386,4 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 }
 
-attributes #0 = { "target-features"="+mve" }
+attributes #0 = { "target-features"="+mve" }
\ No newline at end of file

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll
index 715762551ea2b..50b80e0082ac4 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll
@@ -40,7 +40,7 @@ define void @arm_abs_q7(i8* nocapture readonly %pSrc, i8* nocapture %pDst, i32 %
 ; CHECK-NEXT:    store <16 x i8> [[TMP5]], <16 x i8>* [[TMP6]], align 1, !alias.scope !3, !noalias !0
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
@@ -64,7 +64,7 @@ define void @arm_abs_q7(i8* nocapture readonly %pSrc, i8* nocapture %pDst, i32 %
 ; CHECK-NEXT:    store i8 [[COND11]], i8* [[PDST_ADDR_020]], align 1
 ; CHECK-NEXT:    [[DEC]] = add i32 [[BLKCNT_021]], -1
 ; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP7:!llvm.loop !.*]]
 ; CHECK:       while.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -135,7 +135,7 @@ define void @arm_abs_q15(i16* nocapture readonly %pSrc, i16* nocapture %pDst, i3
 ; CHECK-NEXT:    store <8 x i16> [[TMP5]], <8 x i16>* [[TMP6]], align 2, !alias.scope !11, !noalias !8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP13:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
@@ -159,7 +159,7 @@ define void @arm_abs_q15(i16* nocapture readonly %pSrc, i16* nocapture %pDst, i3
 ; CHECK-NEXT:    store i16 [[COND11]], i16* [[PDST_ADDR_021]], align 2
 ; CHECK-NEXT:    [[DEC]] = add i32 [[BLKCNT_022]], -1
 ; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP14:!llvm.loop !.*]]
 ; CHECK:       while.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -230,7 +230,7 @@ define void @arm_abs_q31(i32* nocapture readonly %pSrc, i32* nocapture %pDst, i3
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !alias.scope !18, !noalias !15
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
@@ -254,7 +254,7 @@ define void @arm_abs_q31(i32* nocapture readonly %pSrc, i32* nocapture %pDst, i3
 ; CHECK-NEXT:    store i32 [[COND6]], i32* [[PDST_ADDR_015]], align 4
 ; CHECK-NEXT:    [[DEC]] = add i32 [[BLKCNT_016]], -1
 ; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP21:!llvm.loop !.*]]
 ; CHECK:       while.end:
 ; CHECK-NEXT:    ret void
 ;

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
index d76a35155dfbe..5bf6e242a651d 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
@@ -20,8 +20,8 @@ define i32 @reduction_sum_single(i32* noalias nocapture %A) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD]])
 ; CHECK-NEXT:    [[TMP3]] = add i32 [[TMP2]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -76,8 +76,8 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK-NEXT:    [[TMP10]] = add i32 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
index 6c9bb9700f63f..785b093239028 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
@@ -20,8 +20,8 @@ define void @arm_offset_q15(i16* nocapture readonly %pSrc, i16 signext %offset,
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[BLOCKSIZE]], 7
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT6]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT8]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -30,7 +30,7 @@ define void @arm_offset_q15(i16* nocapture readonly %pSrc, i16 signext %offset,
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[BLOCKSIZE]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>*
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_MASKED_LOAD]], <8 x i16> [[BROADCAST_SPLAT7]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_MASKED_LOAD]], <8 x i16> [[BROADCAST_SPLAT9]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[NEXT_GEP5]] to <8 x i16>*
 ; CHECK-NEXT:    call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[TMP1]], <8 x i16>* [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
index 5dcd99579a0cb..275a8145346c6 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
@@ -65,7 +65,7 @@ define i32 @test(float* nocapture readonly %x) {
 ; CHECK-NEXT:    [[TMP16]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP15]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP18:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP16]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]]
@@ -91,7 +91,7 @@ define i32 @test(float* nocapture readonly %x) {
 ; CHECK-NEXT:    [[SUB127]] = fsub fast double [[DVAL1_4131]], [[MUL126]]
 ; CHECK-NEXT:    [[INC129]] = add nuw nsw i32 [[I_2132]], 1
 ; CHECK-NEXT:    [[EXITCOND143:%.*]] = icmp eq i32 [[INC129]], [[T]]
-; CHECK-NEXT:    br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       outerend:
 ; CHECK-NEXT:    [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[CONV138:%.*]] = fptosi double [[SUB127_LCSSA]] to i32

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll
index 60c128700c9d9..4cb9e47e75a6a 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -loop-vectorize -tail-predication=enabled -S < %s | \
 ; RUN:  FileCheck %s
 
@@ -7,59 +6,10 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 ; Test that ARMTTIImpl::preferPredicateOverEpilogue triggers tail-folding.
 
 define dso_local void @f1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) {
-; CHECK-LABEL: @f1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_09]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
+; CHECK-LABEL: f1(
+; CHECK:       entry:
+; CHECK:       @llvm.get.active.lane.mask
+; CHECK:       }
 entry:
   %cmp8 = icmp sgt i32 %N, 0
   br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
@@ -88,65 +38,10 @@ for.body:                                         ; preds = %for.body.preheader,
 }
 
 define dso_local void @f32_reduction(float* nocapture readonly %Input, i32 %N, float* nocapture %Output) {
-; CHECK-LABEL: @f32_reduction(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP6:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP6]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
-; CHECK:       while.body.preheader:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[N]], [[N_VEC]]
-; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr float, float* [[INPUT:%.*]], i32 [[N_VEC]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: f32_reduction(
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr float, float* [[INPUT]], i32 [[TMP0]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IV]], i32 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP1]], i32 [[N]])
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, float* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
-; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi float* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[INPUT]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[WHILE_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SUM_08:%.*]] = phi float [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[INPUT_ADDR_07:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[INPUT_ADDR_07]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[INPUT_ADDR_07]], align 4
-; CHECK-NEXT:    [[ADD]] = fadd fast float [[TMP8]], [[SUM_08]]
-; CHECK-NEXT:    [[DEC]] = add i32 [[BLKCNT_09]], -1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_END_LOOPEXIT]], label [[WHILE_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_END]]
-; CHECK:       while.end:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT:    [[CONV:%.*]] = uitofp i32 [[N]] to float
-; CHECK-NEXT:    [[DIV:%.*]] = fdiv fast float [[SUM_0_LCSSA]], [[CONV]]
-; CHECK-NEXT:    store float [[DIV]], float* [[OUTPUT:%.*]], align 4
-; CHECK-NEXT:    ret void
-;
+; CHECK:       @llvm.masked.load
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
 entry:
   %cmp6 = icmp eq i32 %N, 0
   br i1 %cmp6, label %while.end, label %while.body.preheader
@@ -178,65 +73,10 @@ while.end:                                        ; preds = %while.end.loopexit,
 }
 
 define dso_local void @f16_reduction(half* nocapture readonly %Input, i32 %N, half* nocapture %Output) {
-; CHECK-LABEL: @f16_reduction(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP6:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP6]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
-; CHECK:       while.body.preheader:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[N]], [[N_VEC]]
-; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr half, half* [[INPUT:%.*]], i32 [[N_VEC]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: f16_reduction(
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr half, half* [[INPUT]], i32 [[TMP0]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <8 x i32> [[VEC_IV]], i32 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[TMP1]], i32 [[N]])
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr half, half* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast half* [[TMP2]] to <8 x half>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* [[TMP3]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x half> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <8 x half> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP5]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x half> [[TMP4]], <8 x half> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[TMP5]])
-; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi half* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[INPUT]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi half [ 0xH0000, [[WHILE_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SUM_08:%.*]] = phi half [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[INPUT_ADDR_07:%.*]] = phi half* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds half, half* [[INPUT_ADDR_07]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = load half, half* [[INPUT_ADDR_07]], align 2
-; CHECK-NEXT:    [[ADD]] = fadd fast half [[TMP8]], [[SUM_08]]
-; CHECK-NEXT:    [[DEC]] = add i32 [[BLKCNT_09]], -1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_END_LOOPEXIT]], label [[WHILE_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi half [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_END]]
-; CHECK:       while.end:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi half [ 0xH0000, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT:    [[CONV:%.*]] = uitofp i32 [[N]] to half
-; CHECK-NEXT:    [[DIV:%.*]] = fdiv fast half [[SUM_0_LCSSA]], [[CONV]]
-; CHECK-NEXT:    store half [[DIV]], half* [[OUTPUT:%.*]], align 2
-; CHECK-NEXT:    ret void
-;
+; CHECK:       @llvm.masked.load
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
 entry:
   %cmp6 = icmp eq i32 %N, 0
   br i1 %cmp6, label %while.end, label %while.body.preheader
@@ -268,88 +108,10 @@ while.end:                                        ; preds = %while.end.loopexit,
 }
 
 define dso_local void @mixed_f32_i32_reduction(float* nocapture readonly %fInput, i32* nocapture readonly %iInput, i32 %N, float* nocapture %fOutput, i32* nocapture %iOutput) {
-; CHECK-LABEL: @mixed_f32_i32_reduction(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP15]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
-; CHECK:       while.body.preheader:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[N]], [[N_VEC]]
-; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr float, float* [[FINPUT:%.*]], i32 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END4:%.*]] = getelementptr i32, i32* [[IINPUT:%.*]], i32 [[N_VEC]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: mixed_f32_i32_reduction(
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr float, float* [[FINPUT]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i32, i32* [[IINPUT]], i32 [[TMP1]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[VEC_IV]], i32 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP2]], i32 [[N]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, i32* [[NEXT_GEP6]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT:    [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr float, float* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD7:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_LOAD7]], [[VEC_PHI5]]
-; CHECK-NEXT:    [[TMP11]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP10]], <4 x float> [[VEC_PHI5]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP11]])
-; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi float* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[FINPUT]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32* [ [[IND_END4]], [[MIDDLE_BLOCK]] ], [ [[IINPUT]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[WHILE_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX8:%.*]] = phi float [ 0.000000e+00, [[WHILE_BODY_PREHEADER]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[BLKCNT_020:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ISUM_019:%.*]] = phi i32 [ [[ADD2:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[FSUM_018:%.*]] = phi float [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX8]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[FINPUT_ADDR_017:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[IINPUT_ADDR_016:%.*]] = phi i32* [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[FINPUT_ADDR_017]], i32 1
-; CHECK-NEXT:    [[INCDEC_PTR1]] = getelementptr inbounds i32, i32* [[IINPUT_ADDR_016]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[IINPUT_ADDR_016]], align 4
-; CHECK-NEXT:    [[ADD2]] = add nsw i32 [[TMP14]], [[ISUM_019]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* [[FINPUT_ADDR_017]], align 4
-; CHECK-NEXT:    [[ADD]] = fadd fast float [[TMP15]], [[FSUM_018]]
-; CHECK-NEXT:    [[DEC]] = add i32 [[BLKCNT_020]], -1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_END_LOOPEXIT]], label [[WHILE_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD2]], [[WHILE_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[PHITMP:%.*]] = sitofp i32 [[ADD2_LCSSA]] to float
-; CHECK-NEXT:    br label [[WHILE_END]]
-; CHECK:       while.end:
-; CHECK-NEXT:    [[FSUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT:    [[ISUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[PHITMP]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT:    [[CONV:%.*]] = uitofp i32 [[N]] to float
-; CHECK-NEXT:    [[DIV:%.*]] = fdiv fast float [[FSUM_0_LCSSA]], [[CONV]]
-; CHECK-NEXT:    store float [[DIV]], float* [[FOUTPUT:%.*]], align 4
-; CHECK-NEXT:    [[DIV5:%.*]] = fdiv fast float [[ISUM_0_LCSSA]], [[CONV]]
-; CHECK-NEXT:    [[CONV6:%.*]] = fptosi float [[DIV5]] to i32
-; CHECK-NEXT:    store i32 [[CONV6]], i32* [[IOUTPUT:%.*]], align 4
-; CHECK-NEXT:    ret void
-;
+; CHECK:       @llvm.masked.load
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
 entry:
   %cmp15 = icmp eq i32 %N, 0
   br i1 %cmp15, label %while.end, label %while.body.preheader
@@ -392,54 +154,10 @@ while.end:
 }
 
 define dso_local i32 @i32_mul_reduction(i32* noalias nocapture readonly %B, i32 %N) {
-; CHECK-LABEL: @i32_mul_reduction(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: i32_mul_reduction(
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL:%.*]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[S_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[MUL_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT:    ret i32 [[S_0_LCSSA]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[S_07:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL]] = mul nsw i32 [[TMP8]], [[S_07]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-;
+; CHECK:       @llvm.masked.load
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
 entry:
   %cmp6 = icmp sgt i32 %N, 0
   br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
@@ -467,54 +185,10 @@ for.body:
 }
 
 define dso_local i32 @i32_or_reduction(i32* noalias nocapture readonly %B, i32 %N) {
-; CHECK-LABEL: @i32_or_reduction(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: i32_or_reduction(
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[OR_LCSSA:%.*]] = phi i32 [ [[OR:%.*]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[S_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OR_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT:    ret i32 [[S_0_LCSSA]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[S_07:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[OR]] = or i32 [[TMP8]], [[S_07]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-;
+; CHECK:       @llvm.masked.load
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
 entry:
   %cmp6 = icmp sgt i32 %N, 0
   br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
@@ -542,55 +216,10 @@ for.body:                                         ; preds = %for.body.preheader,
 }
 
 define dso_local i32 @i32_and_reduction(i32* noalias nocapture readonly %A, i32 %N, i32 %S) {
-; CHECK-LABEL: @i32_and_reduction(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 [[S:%.*]], i32 0
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: i32_and_reduction(
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP1]], i32 [[N]])
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP6]])
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[S]], [[FOR_BODY_PREHEADER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[AND_LCSSA:%.*]] = phi i32 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[S_ADDR_0_LCSSA:%.*]] = phi i32 [ [[S]], [[ENTRY:%.*]] ], [ [[AND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT:    ret i32 [[S_ADDR_0_LCSSA]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[S_ADDR_06:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_07]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[AND]] = and i32 [[TMP9]], [[S_ADDR_06]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
+; CHECK:       @llvm.masked.load
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
 entry:
   %cmp5 = icmp sgt i32 %N, 0
   br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup

diff  --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
index dfcc315581ae8..41622bde65315 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -loop-vectorize -tail-predication=enabled -S < %s | \
 ; RUN:  FileCheck %s
 
@@ -11,51 +10,11 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 ;   preferPredicateOverEpilogue: hardware-loop is not profitable.
 ;
 define dso_local void @tail_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) {
-; CHECK-LABEL: @tail_folding(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: tail_folding(
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 428
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 430, 428
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
+; CHECK-NOT:   call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; CHECK-NOT:   call void @llvm.masked.store.v4i32.p0v4i32(
+; CHECK:       br i1 %{{.*}}, label %{{.*}}, label %vector.body
 entry:
   br label %for.body
 
@@ -80,51 +39,17 @@ for.body:
 ; tail-folded.
 ;
 define dso_local void @predicate_loop_hint(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) {
-; CHECK-LABEL: @predicate_loop_hint(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: predicate_loop_hint(
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 430)
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 432, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
+; CHECK:         %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:         %[[ELEM0:.*]] = add i64 %index, 0
+; CHECK:         %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 430)
+; CHECK:         %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
+; CHECK:         %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
+; CHECK:         %[[ADD:.*]] = add nsw <4 x i32> %[[WML2]], %[[WML1]]
+; CHECK:         call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]], {{.*}}<4 x i1> %active.lane.mask
+; CHECK:         %index.next = add i64 %index, 4
+; CHECK:         br i1 %{{.*}}, label %{{.*}}, label %vector.body
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll
index a0fd07cabc529..96ef73e34d22d 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -loop-vectorize -tail-predication=enabled -S < %s | \
 ; RUN:  FileCheck %s -check-prefix=CHECK
 
@@ -13,97 +12,24 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 ; overrules this.
 ;
 define dso_local void @flag_overrules_hint(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
-; CHECK-LABEL: @flag_overrules_hint(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: flag_overrules_hint(
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 428
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 430, 428
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
-; PREDFLAG-LABEL: @flag_overrules_hint(
-; PREDFLAG-NEXT:  entry:
-; PREDFLAG-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PREDFLAG:       vector.ph:
-; PREDFLAG-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PREDFLAG:       vector.body:
-; PREDFLAG-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDFLAG-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; PREDFLAG-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 430)
-; PREDFLAG-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; PREDFLAG-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; PREDFLAG-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; PREDFLAG-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; PREDFLAG-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; PREDFLAG-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; PREDFLAG-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
-; PREDFLAG-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; PREDFLAG-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; PREDFLAG-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; PREDFLAG-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; PREDFLAG-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; PREDFLAG-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
-; PREDFLAG-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; PREDFLAG:       middle.block:
-; PREDFLAG-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; PREDFLAG:       scalar.ph:
-; PREDFLAG-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 432, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; PREDFLAG-NEXT:    br label [[FOR_BODY:%.*]]
-; PREDFLAG:       for.cond.cleanup:
-; PREDFLAG-NEXT:    ret void
-; PREDFLAG:       for.body:
-; PREDFLAG-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; PREDFLAG-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; PREDFLAG-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; PREDFLAG-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; PREDFLAG-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; PREDFLAG-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
-; PREDFLAG-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; PREDFLAG-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; PREDFLAG-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; PREDFLAG-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; PREDFLAG-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
+; CHECK-NOT:   @llvm.masked.load.v8i32.p0v8i32(
+; CHECK-NOT:   @llvm.masked.store.v8i32.p0v8i32(
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
 
+; PREDFLAG-LABEL: flag_overrules_hint(
+; PREDFLAG:  vector.body:
+; PREDFLAG:  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREDFLAG:  %[[ELEM0:.*]] = add i64 %index, 0
+; PREDFLAG:  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 430)
+; PREDFLAG:  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREDFLAG:  %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREDFLAG:  %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load
+; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREDFLAG:  %index.next = add i64 %index, 4
+; PREDFLAG:  %[[CMP:.*]] = icmp eq i64 %index.next, 432
+; PREDFLAG:  br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !0
 entry:
   br label %for.body
 
@@ -125,201 +51,29 @@ for.body:
 }
 
 define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
-; CHECK-LABEL: @interleave4(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 15
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 16
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 12
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP1]], i32 [[N]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP2]], i32 [[N]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK3:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP3]], i32 [[N]])
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP11]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK1]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP13]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP15]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK3]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP21]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 4
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP23]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK1]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 8
-; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 12
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK3]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP28:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD7]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_MASKED_LOAD4]]
-; CHECK-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD5]]
-; CHECK-NEXT:    [[TMP31:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD6]]
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP28]], <4 x i32>* [[TMP37]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 4
-; CHECK-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP29]], <4 x i32>* [[TMP39]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK1]])
-; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 8
-; CHECK-NEXT:    [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP30]], <4 x i32>* [[TMP41]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]])
-; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 12
-; CHECK-NEXT:    [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP31]], <4 x i32>* [[TMP43]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK3]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP44:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP46:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP46]], [[TMP45]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_09]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDFLAG-LABEL: interleave4(
+; PREDFLAG:  %[[ADD1:.*]] = add i32 %index, 0
+; PREDFLAG:  %[[ADD2:.*]] = add i32 %index, 4
+; PREDFLAG:  %[[ADD3:.*]] = add i32 %index, 8
+; PREDFLAG:  %[[ADD4:.*]] = add i32 %index, 12
+; PREDFLAG:  %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %N)
+; PREDFLAG:  %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %N)
+; PREDFLAG:  %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %N)
+; PREDFLAG:  %[[ALM4:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD4]], i32 %N)
+;
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
 ;
-; PREDFLAG-LABEL: @interleave4(
-; PREDFLAG-NEXT:  entry:
-; PREDFLAG-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; PREDFLAG-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; PREDFLAG:       for.body.preheader:
-; PREDFLAG-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PREDFLAG:       vector.ph:
-; PREDFLAG-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 15
-; PREDFLAG-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 16
-; PREDFLAG-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; PREDFLAG-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PREDFLAG:       vector.body:
-; PREDFLAG-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDFLAG-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; PREDFLAG-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 4
-; PREDFLAG-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 8
-; PREDFLAG-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 12
-; PREDFLAG-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; PREDFLAG-NEXT:    [[ACTIVE_LANE_MASK1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP1]], i32 [[N]])
-; PREDFLAG-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP2]], i32 [[N]])
-; PREDFLAG-NEXT:    [[ACTIVE_LANE_MASK3:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP3]], i32 [[N]])
-; PREDFLAG-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; PREDFLAG-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP1]]
-; PREDFLAG-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP2]]
-; PREDFLAG-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP3]]
-; PREDFLAG-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; PREDFLAG-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; PREDFLAG-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4
-; PREDFLAG-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP11]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK1]], <4 x i32> poison)
-; PREDFLAG-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; PREDFLAG-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP13]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]], <4 x i32> poison)
-; PREDFLAG-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12
-; PREDFLAG-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP15]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK3]], <4 x i32> poison)
-; PREDFLAG-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]]
-; PREDFLAG-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[TMP1]]
-; PREDFLAG-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[TMP2]]
-; PREDFLAG-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[TMP3]]
-; PREDFLAG-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; PREDFLAG-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP21]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; PREDFLAG-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 4
-; PREDFLAG-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP23]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK1]], <4 x i32> poison)
-; PREDFLAG-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 8
-; PREDFLAG-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]], <4 x i32> poison)
-; PREDFLAG-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 12
-; PREDFLAG-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK3]], <4 x i32> poison)
-; PREDFLAG-NEXT:    [[TMP28:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD7]], [[WIDE_MASKED_LOAD]]
-; PREDFLAG-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_MASKED_LOAD4]]
-; PREDFLAG-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD5]]
-; PREDFLAG-NEXT:    [[TMP31:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD6]]
-; PREDFLAG-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; PREDFLAG-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP1]]
-; PREDFLAG-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP2]]
-; PREDFLAG-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP3]]
-; PREDFLAG-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 0
-; PREDFLAG-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; PREDFLAG-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP28]], <4 x i32>* [[TMP37]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; PREDFLAG-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 4
-; PREDFLAG-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; PREDFLAG-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP29]], <4 x i32>* [[TMP39]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK1]])
-; PREDFLAG-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 8
-; PREDFLAG-NEXT:    [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <4 x i32>*
-; PREDFLAG-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP30]], <4 x i32>* [[TMP41]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]])
-; PREDFLAG-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 12
-; PREDFLAG-NEXT:    [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <4 x i32>*
-; PREDFLAG-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP31]], <4 x i32>* [[TMP43]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK3]])
-; PREDFLAG-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
-; PREDFLAG-NEXT:    [[TMP44:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDFLAG-NEXT:    br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; PREDFLAG:       middle.block:
-; PREDFLAG-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; PREDFLAG:       scalar.ph:
-; PREDFLAG-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; PREDFLAG-NEXT:    br label [[FOR_BODY:%.*]]
-; PREDFLAG:       for.cond.cleanup.loopexit:
-; PREDFLAG-NEXT:    br label [[FOR_COND_CLEANUP]]
-; PREDFLAG:       for.cond.cleanup:
-; PREDFLAG-NEXT:    ret void
-; PREDFLAG:       for.body:
-; PREDFLAG-NEXT:    [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; PREDFLAG-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]]
-; PREDFLAG-NEXT:    [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; PREDFLAG-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_09]]
-; PREDFLAG-NEXT:    [[TMP46:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; PREDFLAG-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP46]], [[TMP45]]
-; PREDFLAG-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]]
-; PREDFLAG-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; PREDFLAG-NEXT:    [[INC]] = add nuw nsw i32 [[I_09]], 1
-; PREDFLAG-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; PREDFLAG-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]])
+; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]])
+; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]])
+; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]])
 ;
 entry:
   %cmp8 = icmp sgt i32 %N, 0

diff  --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
index d1b1d4f3144ac..7ecfd2fd744ab 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
@@ -23,18 +23,18 @@ define void @outside_user_blocks_tail_folding(i8* nocapture readonly %ptr, i32 %
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>*
-; CHECK-NEXT:    store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP6]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -47,8 +47,8 @@ define void @outside_user_blocks_tail_folding(i8* nocapture readonly %ptr, i32 %
 ; CHECK-NEXT:    [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1
 ; CHECK-NEXT:    [[DEC]] = add nsw i32 [[DEC66]], -1
-; CHECK-NEXT:    [[TMP7:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
-; CHECK-NEXT:    store i8 [[TMP7]], i8* [[BUFF]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    store i8 [[TMP8]], i8* [[BUFF]], align 1
 ; CHECK-NEXT:    [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       end:

diff  --git a/llvm/test/Transforms/LoopVectorize/PowerPC/massv-altivec.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/massv-altivec.ll
index c655627789456..eafc140b90c3b 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/massv-altivec.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/massv-altivec.ll
@@ -1,7 +1,6 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -vector-library=MASSV -inject-tli-mappings -loop-vectorize -force-vector-interleave=1 -mattr=-altivec -S < %s | FileCheck %s
 
-target datalayout = "e-m:e-i64:64-n32:64"
+target datalayout = "e-m:e-i64:64-n32:64" 
 target triple = "powerpc64le-unknown-linux-gnu"
 
 declare double @cbrt(double) #0
@@ -14,20 +13,8 @@ declare float @atanhf(float) #0
 ; Check that massv entries are not generated.
 define void @cbrt_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @cbrt_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @cbrt(double [[CONV]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NOT: __cbrtd2{{.*}}<2 x double>
+; CHECK: ret void
 ;
 entry:
   br label %for.body
@@ -49,20 +36,8 @@ for.end:
 
 define void @cbrt_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @cbrt_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @cbrtf(float [[CONV]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NOT: __cbrtf4{{.*}}<4 x float>
+; CHECK: ret void
 ;
 entry:
   br label %for.body
@@ -84,20 +59,8 @@ for.end:
 
 define void @atanh_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @atanh_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @atanh(double [[CONV]]) #[[ATTR4:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NOT: __atanhd2{{.*}}<2 x double>
+; CHECK: ret void
 ;
 entry:
   br label %for.body
@@ -119,20 +82,8 @@ for.end:
 
 define void @atanh_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @atanh_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @atanhf(float [[CONV]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NOT: __atanhf4{{.*}}<4 x float>
+; CHECK: ret void
 ;
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/PowerPC/massv-calls.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/massv-calls.ll
index 93c17ee2c987b..30fe5f96cc5bf 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/massv-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/massv-calls.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -vector-library=MASSV -mtriple=powerpc64le-unknown-linux-gnu -inject-tli-mappings -loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s
 ; RUN: opt -vector-library=MASSV -vec-extabi -mattr=+altivec -mtriple=powerpc64-ibm-aix-xcoff -inject-tli-mappings -loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s
 
@@ -88,6 +87,10 @@ declare double @atanh(double) #0
 declare float @atanhf(float) #0
 
 define void @cbrt_f64(double* nocapture %varray) {
+; CHECK-LABEL: @cbrt_f64(
+; CHECK: __cbrtd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -107,6 +110,10 @@ for.end:
 }
 
 define void @cbrt_f32(float* nocapture %varray) {
+; CHECK-LABEL: @cbrt_f32(
+; CHECK: __cbrtf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -126,6 +133,10 @@ for.end:
 }
 
 define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64(
+; CHECK:  __powd2{{.*}}<2 x double>
+; CHECK:  ret void
+;
 entry:
   br label %for.body
 
@@ -147,6 +158,10 @@ for.end:
 }
 
 define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64_intrinsic(
+; CHECK: __powd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -168,6 +183,10 @@ for.end:
 }
 
 define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32(
+; CHECK: __powf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -189,6 +208,10 @@ for.end:
 }
 
 define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32_intrinsic(
+; CHECK: __powf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -211,20 +234,8 @@ for.end:
 
 define void @sqrt_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @sqrt_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @sqrt(double [[CONV]])
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NOT: __sqrtd2{{.*}}<2 x double>
+; CHECK: ret void
 ;
 entry:
   br label %for.body
@@ -246,20 +257,8 @@ for.end:
 
 define void @sqrt_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @sqrt_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @sqrtf(float [[CONV]])
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NOT: __sqrtf4{{.*}}<4 x float>
+; CHECK: ret void
 ;
 entry:
   br label %for.body
@@ -280,6 +279,10 @@ for.end:
 }
 
 define void @exp_f64(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64(
+; CHECK: __expd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -299,6 +302,10 @@ for.end:
 }
 
 define void @exp_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64_intrinsic(
+; CHECK: __expd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -318,6 +325,10 @@ for.end:
 }
 
 define void @exp_f32(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32(
+; CHECK: __expf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -337,6 +348,10 @@ for.end:
 }
 
 define void @exp_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32_intrinsic(
+; CHECK: __expf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -356,6 +371,10 @@ for.end:
 }
 
 define void @exp2_f64(double* nocapture %varray) {
+; CHECK-LABEL: @exp2_f64(
+; CHECK: __exp2d2{{.*}}<2 x double>
+; CHECK:  ret void
+;
 entry:
   br label %for.body
 
@@ -375,6 +394,10 @@ for.end:
 }
 
 define void @exp2_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @exp2_f64_intrinsic(
+; CHECK: __exp2d2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -394,6 +417,10 @@ for.end:
 }
 
 define void @exp2_f32(float* nocapture %varray) {
+; CHECK-LABEL: @exp2_f32(
+; CHECK: __exp2f4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -413,6 +440,10 @@ for.end:
 }
 
 define void @exp2_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @exp2_f32_intrinsic(
+; CHECK: __exp2f4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -432,6 +463,10 @@ for.end:
 }
 
 define void @expm1_f64(double* nocapture %varray) {
+; CHECK-LABEL: @expm1_f64(
+; CHECK: __expm1d2{{.*}}<2 x double>
+; CHECK:  ret void
+;
 entry:
   br label %for.body
 
@@ -451,6 +486,10 @@ for.end:
 }
 
 define void @expm1_f32(float* nocapture %varray) {
+; CHECK-LABEL: @expm1_f32(
+; CHECK: __expm1f4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -470,6 +509,10 @@ for.end:
 }
 
 define void @log_f64(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64(
+; CHECK: __logd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -489,6 +532,10 @@ for.end:
 }
 
 define void @log_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64_intrinsic(
+; CHECK: __logd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -508,6 +555,10 @@ for.end:
 }
 
 define void @log_f32(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32(
+; CHECK: __logf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -527,6 +578,10 @@ for.end:
 }
 
 define void @log_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32_intrinsic(
+; CHECK: __logf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -546,6 +601,10 @@ for.end:
 }
 
 define void @log1p_f64(double* nocapture %varray) {
+; CHECK-LABEL: @log1p_f64(
+; CHECK: __log1pd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -565,6 +624,10 @@ for.end:
 }
 
 define void @log1p_f32(float* nocapture %varray) {
+; CHECK-LABEL: @log1p_f32(
+; CHECK: __log1pf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -584,6 +647,10 @@ for.end:
 }
 
 define void @log10_f64(double* nocapture %varray) {
+; CHECK-LABEL: @log10_f64(
+; CHECK: __log10d2(<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -603,6 +670,10 @@ for.end:
 }
 
 define void @log10_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @log10_f64_intrinsic(
+; CHECK: __log10d2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -622,6 +693,10 @@ for.end:
 }
 
 define void @log10_f32(float* nocapture %varray) {
+; CHECK-LABEL: @log10_f32(
+; CHECK: __log10f4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -641,6 +716,10 @@ for.end:
 }
 
 define void @log10_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @log10_f32_intrinsic(
+; CHECK: __log10f4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -660,6 +739,10 @@ for.end:
 }
 
 define void @log2_f64(double* nocapture %varray) {
+; CHECK-LABEL: @log2_f64(
+; CHECK: __log2d2(<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -679,6 +762,10 @@ for.end:
 }
 
 define void @log2_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @log2_f64_intrinsic(
+; CHECK: __log2d2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -698,6 +785,10 @@ for.end:
 }
 
 define void @log2_f32(float* nocapture %varray) {
+; CHECK-LABEL: @log2_f32(
+; CHECK: __log2f4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -717,6 +808,10 @@ for.end:
 }
 
 define void @log2_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @log2_f32_intrinsic(
+; CHECK: __log2f4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -736,6 +831,10 @@ for.end:
 }
 
 define void @sin_f64(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64(
+; CHECK: __sind2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -755,6 +854,10 @@ for.end:
 }
 
 define void @sin_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64_intrinsic(
+; CHECK: __sind2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -774,6 +877,10 @@ for.end:
 }
 
 define void @sin_f32(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32(
+; CHECK: __sinf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -793,6 +900,10 @@ for.end:
 }
 
 define void @sin_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32_intrinsic(
+; CHECK: __sinf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -812,6 +923,10 @@ for.end:
 }
 
 define void @cos_f64(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64(
+; CHECK: __cosd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -831,6 +946,10 @@ for.end:
 }
 
 define void @cos_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <2 x double> @__cosd2(<2 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
 entry:
   br label %for.body
 
@@ -850,6 +969,10 @@ for.end:
 }
 
 define void @cos_f32(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32(
+; CHECK: __cosf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -869,6 +992,10 @@ for.end:
 }
 
 define void @cos_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32_intrinsic(
+; CHECK: __cosf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -888,6 +1015,10 @@ for.end:
 }
 
 define void @tan_f64(double* nocapture %varray) {
+; CHECK-LABEL: @tan_f64(
+; CHECK: __tand2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -907,6 +1038,10 @@ for.end:
 }
 
 define void @tan_f32(float* nocapture %varray) {
+; CHECK-LABEL: @tan_f32(
+; CHECK: __tanf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -926,6 +1061,10 @@ for.end:
 }
 
 define void @asin_f64(double* nocapture %varray) {
+; CHECK-LABEL: @asin_f64(
+; CHECK: __asind2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -945,6 +1084,10 @@ for.end:
 }
 
 define void @asin_f32(float* nocapture %varray) {
+; CHECK-LABEL: @asin_f32(
+; CHECK: __asinf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -964,6 +1107,10 @@ for.end:
 }
 
 define void @acos_f64(double* nocapture %varray) {
+; CHECK-LABEL: @acos_f64(
+; CHECK: __acosd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -983,6 +1130,10 @@ for.end:
 }
 
 define void @acos_f32(float* nocapture %varray) {
+; CHECK-LABEL: @acos_f32(
+; CHECK: __acosf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1002,6 +1153,10 @@ for.end:
 }
 
 define void @atan_f64(double* nocapture %varray) {
+; CHECK-LABEL: @atan_f64(
+; CHECK: __atand2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1021,6 +1176,10 @@ for.end:
 }
 
 define void @atan_f32(float* nocapture %varray) {
+; CHECK-LABEL: @atan_f32(
+; CHECK: __atanf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1040,6 +1199,10 @@ for.end:
 }
 
 define void @atan2_f64(double* nocapture %varray) {
+; CHECK-LABEL: @atan2_f64(
+; CHECK: __atan2d2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1059,6 +1222,10 @@ for.end:
 }
 
 define void @atan2_f32(float* nocapture %varray) {
+; CHECK-LABEL: @atan2_f32(
+; CHECK: __atan2f4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1078,6 +1245,10 @@ for.end:
 }
 
 define void @sinh_f64(double* nocapture %varray) {
+; CHECK-LABEL: @sinh_f64(
+; CHECK: __sinhd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1097,6 +1268,10 @@ for.end:
 }
 
 define void @sinh_f32(float* nocapture %varray) {
+; CHECK-LABEL: @sinh_f32(
+; CHECK: __sinhf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1116,6 +1291,10 @@ for.end:
 }
 
 define void @cosh_f64(double* nocapture %varray) {
+; CHECK-LABEL: @cosh_f64(
+; CHECK: __coshd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1135,6 +1314,10 @@ for.end:
 }
 
 define void @cosh_f32(float* nocapture %varray) {
+; CHECK-LABEL: @cosh_f32(
+; CHECK: __coshf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1154,6 +1337,10 @@ for.end:
 }
 
 define void @tanh_f64(double* nocapture %varray) {
+; CHECK-LABEL: @tanh_f64(
+; CHECK: __tanhd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1173,6 +1360,10 @@ for.end:
 }
 
 define void @tanh_f32(float* nocapture %varray) {
+; CHECK-LABEL: @tanh_f32(
+; CHECK: __tanhf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1192,6 +1383,10 @@ for.end:
 }
 
 define void @asinh_f64(double* nocapture %varray) {
+; CHECK-LABEL: @asinh_f64(
+; CHECK: __asinhd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1211,6 +1406,10 @@ for.end:
 }
 
 define void @asinh_f32(float* nocapture %varray) {
+; CHECK-LABEL: @asinh_f32(
+; CHECK: __asinhf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1230,6 +1429,10 @@ for.end:
 }
 
 define void @acosh_f64(double* nocapture %varray) {
+; CHECK-LABEL: @acosh_f64(
+; CHECK: __acoshd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1249,6 +1452,10 @@ for.end:
 }
 
 define void @acosh_f32(float* nocapture %varray) {
+; CHECK-LABEL: @acosh_f32(
+; CHECK: __acoshf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1268,6 +1475,10 @@ for.end:
 }
 
 define void @atanh_f64(double* nocapture %varray) {
+; CHECK-LABEL: @atanh_f64(
+; CHECK: __atanhd2{{.*}}<2 x double>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 
@@ -1287,6 +1498,10 @@ for.end:
 }
 
 define void @atanh_f32(float* nocapture %varray) {
+; CHECK-LABEL: @atanh_f32(
+; CHECK: __atanhf4{{.*}}<4 x float>
+; CHECK: ret void
+;
 entry:
   br label %for.body
 

diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/massv-nobuiltin.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/massv-nobuiltin.ll
index 9d312e1f1e73e..5e713a551f32b 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/massv-nobuiltin.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/massv-nobuiltin.ll
@@ -1,7 +1,6 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -vector-library=MASSV -inject-tli-mappings -loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s
 
-target datalayout = "e-m:e-i64:64-n32:64"
+target datalayout = "e-m:e-i64:64-n32:64" 
 target triple = "powerpc64le-unknown-linux-gnu"
 
 declare double @atanh(double) #1
@@ -10,20 +9,8 @@ declare float @atanhf(float) #1
 ; Check that functions marked as nobuiltin are not lowered to massv entries.
 define void @atanh_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @atanh_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @atanh(double [[CONV]])
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NOT: __atanhd2{{.*}}<2 x double>
+; CHECK: ret void
 ;
 entry:
   br label %for.body
@@ -45,20 +32,8 @@ for.end:
 
 define void @atanh_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @atanh_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @atanhf(float [[CONV]])
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NOT: __atanhf4{{.*}}<4 x float>
+; CHECK: ret void
 ;
 entry:
   br label %for.body

diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/massv-unsupported.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/massv-unsupported.ll
index 656b2a8d9da0c..e923a06060cd4 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/massv-unsupported.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/massv-unsupported.ll
@@ -1,7 +1,6 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -vector-library=MASSV -inject-tli-mappings -loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s
 
-target datalayout = "e-m:e-i64:64-n32:64"
+target datalayout = "e-m:e-i64:64-n32:64" 
 target triple = "powerpc64le-unknown-linux-gnu"
 
 declare double @ceil(double) #0
@@ -13,20 +12,10 @@ declare float @llvm.sqrt.f32(float) #0
 ; Vector counterpart of ceil is unsupported in MASSV library.
 define void @ceil_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @ceil_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @ceil(double [[CONV]])
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NOT: __ceild2_massv{{.*}}<2 x double>
+; CHECK-NOT: __ceild2_P8{{.*}}<2 x double>
+; CHECK-NOT: __ceild2{{.*}}<2 x double>
+; CHECK: ret void
 ;
 entry:
   br label %for.body
@@ -49,20 +38,10 @@ for.end:
 ; Vector counterpart of fabs is unsupported in MASSV library.
 define void @fabs_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @fabs_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @fabsf(float [[CONV]])
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-NOT: __fabsf4_massv{{.*}}<4 x float>
+; CHECK-NOT: __fabsf4_P8{{.*}}<4 x float>
+; CHECK-NOT: __fabsf4{{.*}}<4 x float>
+; CHECK: ret void
 ;
 entry:
   br label %for.body
@@ -86,42 +65,8 @@ for.end:
 ; They are not lowered to MASSV entries.
 define void @sqrt_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @sqrt_f64_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <2 x i32> [[VEC_IND]] to <2 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @llvm.sqrt.f64(double [[CONV]])
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK: llvm.sqrt.v2f64{{.*}}<2 x double>
+; CHECK: ret void
 ;
 entry:
   br label %for.body
@@ -143,42 +88,8 @@ for.end:
 
 define void @sqrt_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @sqrt_f32_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @llvm.sqrt.f32(float [[CONV]])
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK: llvm.sqrt.v4f32{{.*}}<4 x float>
+; CHECK: ret void
 ;
 entry:
   br label %for.body

diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
index 4f250d2ce5ea2..5084720f7d5c8 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 -S | FileCheck %s --check-prefix VF-TWO-CHECK
 ; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S | FileCheck %s --check-prefix VF-FOUR-CHECK
 
@@ -194,7 +193,7 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias
 ; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP95]], <4 x float>* [[TMP131]], align 4
 ; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 48
 ; VF-TWO-CHECK-NEXT:    [[TMP132:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF-TWO-CHECK-NEXT:    br i1 [[TMP132]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF-TWO-CHECK-NEXT:    br i1 [[TMP132]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV:![0-9]+]]
 ; VF-TWO-CHECK:       middle.block:
 ; VF-TWO-CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -208,27 +207,27 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias
 ; VF-TWO-CHECK-NEXT:    [[N_VEC26:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF25]]
 ; VF-TWO-CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-TWO-CHECK:       vec.epilog.vector.body:
-; VF-TWO-CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT31:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-TWO-CHECK-NEXT:    [[TMP133:%.*]] = add i64 [[OFFSET_IDX]], 0
+; VF-TWO-CHECK-NEXT:    [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT:    [[TMP133:%.*]] = add i64 [[INDEX27]], 0
 ; VF-TWO-CHECK-NEXT:    [[TMP134:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP133]]
 ; VF-TWO-CHECK-NEXT:    [[TMP135:%.*]] = getelementptr inbounds float, float* [[TMP134]], i32 0
 ; VF-TWO-CHECK-NEXT:    [[TMP136:%.*]] = bitcast float* [[TMP135]] to <2 x float>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <2 x float>, <2 x float>* [[TMP136]], align 4
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD30:%.*]] = load <2 x float>, <2 x float>* [[TMP136]], align 4
 ; VF-TWO-CHECK-NEXT:    [[TMP137:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP133]]
 ; VF-TWO-CHECK-NEXT:    [[TMP138:%.*]] = getelementptr inbounds float, float* [[TMP137]], i32 0
 ; VF-TWO-CHECK-NEXT:    [[TMP139:%.*]] = bitcast float* [[TMP138]] to <2 x float>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD30:%.*]] = load <2 x float>, <2 x float>* [[TMP139]], align 4
-; VF-TWO-CHECK-NEXT:    [[TMP140:%.*]] = fadd fast <2 x float> [[WIDE_LOAD29]], [[WIDE_LOAD30]]
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD31:%.*]] = load <2 x float>, <2 x float>* [[TMP139]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP140:%.*]] = fadd fast <2 x float> [[WIDE_LOAD30]], [[WIDE_LOAD31]]
 ; VF-TWO-CHECK-NEXT:    [[TMP141:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP133]]
 ; VF-TWO-CHECK-NEXT:    [[TMP142:%.*]] = getelementptr inbounds float, float* [[TMP141]], i32 0
 ; VF-TWO-CHECK-NEXT:    [[TMP143:%.*]] = bitcast float* [[TMP142]] to <2 x float>*
 ; VF-TWO-CHECK-NEXT:    store <2 x float> [[TMP140]], <2 x float>* [[TMP143]], align 4
-; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT31]] = add nuw i64 [[OFFSET_IDX]], 2
-; VF-TWO-CHECK-NEXT:    [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC26]]
-; VF-TWO-CHECK-NEXT:    br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT28]] = add nuw i64 [[INDEX27]], 2
+; VF-TWO-CHECK-NEXT:    [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC26]]
+; VF-TWO-CHECK-NEXT:    br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV:![0-9]+]]
 ; VF-TWO-CHECK:       vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT:    [[CMP_N27:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
-; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N27]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-TWO-CHECK-NEXT:    [[CMP_N29:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
+; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N29]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-TWO-CHECK:       vec.epilog.scalar.ph:
 ; VF-TWO-CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; VF-TWO-CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -452,27 +451,27 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias
 ; VF-FOUR-CHECK-NEXT:    [[N_VEC26:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF25]]
 ; VF-FOUR-CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-FOUR-CHECK:       vec.epilog.vector.body:
-; VF-FOUR-CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT31:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-FOUR-CHECK-NEXT:    [[TMP133:%.*]] = add i64 [[OFFSET_IDX]], 0
+; VF-FOUR-CHECK-NEXT:    [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-FOUR-CHECK-NEXT:    [[TMP133:%.*]] = add i64 [[INDEX27]], 0
 ; VF-FOUR-CHECK-NEXT:    [[TMP134:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP133]]
 ; VF-FOUR-CHECK-NEXT:    [[TMP135:%.*]] = getelementptr inbounds float, float* [[TMP134]], i32 0
 ; VF-FOUR-CHECK-NEXT:    [[TMP136:%.*]] = bitcast float* [[TMP135]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <4 x float>, <4 x float>* [[TMP136]], align 4
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP136]], align 4
 ; VF-FOUR-CHECK-NEXT:    [[TMP137:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP133]]
 ; VF-FOUR-CHECK-NEXT:    [[TMP138:%.*]] = getelementptr inbounds float, float* [[TMP137]], i32 0
 ; VF-FOUR-CHECK-NEXT:    [[TMP139:%.*]] = bitcast float* [[TMP138]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP139]], align 4
-; VF-FOUR-CHECK-NEXT:    [[TMP140:%.*]] = fadd fast <4 x float> [[WIDE_LOAD29]], [[WIDE_LOAD30]]
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD31:%.*]] = load <4 x float>, <4 x float>* [[TMP139]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP140:%.*]] = fadd fast <4 x float> [[WIDE_LOAD30]], [[WIDE_LOAD31]]
 ; VF-FOUR-CHECK-NEXT:    [[TMP141:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP133]]
 ; VF-FOUR-CHECK-NEXT:    [[TMP142:%.*]] = getelementptr inbounds float, float* [[TMP141]], i32 0
 ; VF-FOUR-CHECK-NEXT:    [[TMP143:%.*]] = bitcast float* [[TMP142]] to <4 x float>*
 ; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP140]], <4 x float>* [[TMP143]], align 4
-; VF-FOUR-CHECK-NEXT:    [[INDEX_NEXT31]] = add nuw i64 [[OFFSET_IDX]], 4
-; VF-FOUR-CHECK-NEXT:    [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC26]]
+; VF-FOUR-CHECK-NEXT:    [[INDEX_NEXT28]] = add nuw i64 [[INDEX27]], 4
+; VF-FOUR-CHECK-NEXT:    [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC26]]
 ; VF-FOUR-CHECK-NEXT:    br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; VF-FOUR-CHECK:       vec.epilog.middle.block:
-; VF-FOUR-CHECK-NEXT:    [[CMP_N27:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
-; VF-FOUR-CHECK-NEXT:    br i1 [[CMP_N27]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-FOUR-CHECK-NEXT:    [[CMP_N29:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
+; VF-FOUR-CHECK-NEXT:    br i1 [[CMP_N29]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-FOUR-CHECK:       vec.epilog.scalar.ph:
 ; VF-FOUR-CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; VF-FOUR-CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -541,12 +540,12 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-TWO-CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]])
 ; VF-TWO-CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
 ; VF-TWO-CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; VF-TWO-CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
-; VF-TWO-CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[TMP0]]
-; VF-TWO-CHECK-NEXT:    [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
-; VF-TWO-CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
-; VF-TWO-CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; VF-TWO-CHECK-NEXT:    br i1 [[TMP7]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; VF-TWO-CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
+; VF-TWO-CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]]
+; VF-TWO-CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
+; VF-TWO-CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; VF-TWO-CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP8]]
+; VF-TWO-CHECK-NEXT:    br i1 [[TMP10]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; VF-TWO-CHECK:       vector.main.loop.iter.check:
 ; VF-TWO-CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
 ; VF-TWO-CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -557,142 +556,142 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-TWO-CHECK:       vector.body:
 ; VF-TWO-CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF-TWO-CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; VF-TWO-CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
-; VF-TWO-CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 4
-; VF-TWO-CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], 8
-; VF-TWO-CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[OFFSET_IDX]], 12
-; VF-TWO-CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[OFFSET_IDX]], 16
-; VF-TWO-CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[OFFSET_IDX]], 20
-; VF-TWO-CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 24
-; VF-TWO-CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 28
-; VF-TWO-CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 0
-; VF-TWO-CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 4
-; VF-TWO-CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 8
-; VF-TWO-CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 12
-; VF-TWO-CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 16
-; VF-TWO-CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 20
-; VF-TWO-CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[INDEX]], 24
-; VF-TWO-CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 28
-; VF-TWO-CHECK-NEXT:    [[TMP24:%.*]] = xor i32 [[TMP8]], -1
-; VF-TWO-CHECK-NEXT:    [[TMP25:%.*]] = xor i32 [[TMP9]], -1
-; VF-TWO-CHECK-NEXT:    [[TMP26:%.*]] = xor i32 [[TMP10]], -1
-; VF-TWO-CHECK-NEXT:    [[TMP27:%.*]] = xor i32 [[TMP11]], -1
-; VF-TWO-CHECK-NEXT:    [[TMP28:%.*]] = xor i32 [[TMP12]], -1
-; VF-TWO-CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP13]], -1
-; VF-TWO-CHECK-NEXT:    [[TMP30:%.*]] = xor i32 [[TMP14]], -1
-; VF-TWO-CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP15]], -1
-; VF-TWO-CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP24]], [[N]]
-; VF-TWO-CHECK-NEXT:    [[TMP33:%.*]] = add i32 [[TMP25]], [[N]]
-; VF-TWO-CHECK-NEXT:    [[TMP34:%.*]] = add i32 [[TMP26]], [[N]]
-; VF-TWO-CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP27]], [[N]]
+; VF-TWO-CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 0
+; VF-TWO-CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 4
+; VF-TWO-CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 8
+; VF-TWO-CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[OFFSET_IDX]], 12
+; VF-TWO-CHECK-NEXT:    [[TMP24:%.*]] = add i32 [[OFFSET_IDX]], 16
+; VF-TWO-CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 20
+; VF-TWO-CHECK-NEXT:    [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 24
+; VF-TWO-CHECK-NEXT:    [[TMP27:%.*]] = add i32 [[OFFSET_IDX]], 28
+; VF-TWO-CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; VF-TWO-CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 4
+; VF-TWO-CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 8
+; VF-TWO-CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 12
+; VF-TWO-CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 16
+; VF-TWO-CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 20
+; VF-TWO-CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 24
+; VF-TWO-CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 28
+; VF-TWO-CHECK-NEXT:    [[TMP28:%.*]] = xor i32 [[TMP20]], -1
+; VF-TWO-CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP21]], -1
+; VF-TWO-CHECK-NEXT:    [[TMP30:%.*]] = xor i32 [[TMP22]], -1
+; VF-TWO-CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP23]], -1
+; VF-TWO-CHECK-NEXT:    [[TMP32:%.*]] = xor i32 [[TMP24]], -1
+; VF-TWO-CHECK-NEXT:    [[TMP33:%.*]] = xor i32 [[TMP25]], -1
+; VF-TWO-CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP26]], -1
+; VF-TWO-CHECK-NEXT:    [[TMP35:%.*]] = xor i32 [[TMP27]], -1
 ; VF-TWO-CHECK-NEXT:    [[TMP36:%.*]] = add i32 [[TMP28]], [[N]]
 ; VF-TWO-CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP29]], [[N]]
 ; VF-TWO-CHECK-NEXT:    [[TMP38:%.*]] = add i32 [[TMP30]], [[N]]
 ; VF-TWO-CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP31]], [[N]]
-; VF-TWO-CHECK-NEXT:    [[TMP40:%.*]] = sext i32 [[TMP32]] to i64
-; VF-TWO-CHECK-NEXT:    [[TMP41:%.*]] = sext i32 [[TMP33]] to i64
-; VF-TWO-CHECK-NEXT:    [[TMP42:%.*]] = sext i32 [[TMP34]] to i64
-; VF-TWO-CHECK-NEXT:    [[TMP43:%.*]] = sext i32 [[TMP35]] to i64
+; VF-TWO-CHECK-NEXT:    [[TMP40:%.*]] = add i32 [[TMP32]], [[N]]
+; VF-TWO-CHECK-NEXT:    [[TMP41:%.*]] = add i32 [[TMP33]], [[N]]
+; VF-TWO-CHECK-NEXT:    [[TMP42:%.*]] = add i32 [[TMP34]], [[N]]
+; VF-TWO-CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP35]], [[N]]
 ; VF-TWO-CHECK-NEXT:    [[TMP44:%.*]] = sext i32 [[TMP36]] to i64
 ; VF-TWO-CHECK-NEXT:    [[TMP45:%.*]] = sext i32 [[TMP37]] to i64
 ; VF-TWO-CHECK-NEXT:    [[TMP46:%.*]] = sext i32 [[TMP38]] to i64
 ; VF-TWO-CHECK-NEXT:    [[TMP47:%.*]] = sext i32 [[TMP39]] to i64
-; VF-TWO-CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP40]]
-; VF-TWO-CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP41]]
-; VF-TWO-CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP42]]
-; VF-TWO-CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP43]]
-; VF-TWO-CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP44]]
+; VF-TWO-CHECK-NEXT:    [[TMP48:%.*]] = sext i32 [[TMP40]] to i64
+; VF-TWO-CHECK-NEXT:    [[TMP49:%.*]] = sext i32 [[TMP41]] to i64
+; VF-TWO-CHECK-NEXT:    [[TMP50:%.*]] = sext i32 [[TMP42]] to i64
+; VF-TWO-CHECK-NEXT:    [[TMP51:%.*]] = sext i32 [[TMP43]] to i64
+; VF-TWO-CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP44]]
 ; VF-TWO-CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP45]]
 ; VF-TWO-CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP46]]
 ; VF-TWO-CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP47]]
-; VF-TWO-CHECK-NEXT:    [[TMP56:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 0
-; VF-TWO-CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -3
-; VF-TWO-CHECK-NEXT:    [[TMP58:%.*]] = bitcast float* [[TMP57]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP58]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP56:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP48]]
+; VF-TWO-CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP49]]
+; VF-TWO-CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP50]]
+; VF-TWO-CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP51]]
+; VF-TWO-CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 0
+; VF-TWO-CHECK-NEXT:    [[TMP61:%.*]] = getelementptr inbounds float, float* [[TMP60]], i32 -3
+; VF-TWO-CHECK-NEXT:    [[TMP62:%.*]] = bitcast float* [[TMP61]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP62]], align 4
 ; VF-TWO-CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -4
-; VF-TWO-CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP59]], i32 -3
-; VF-TWO-CHECK-NEXT:    [[TMP61:%.*]] = bitcast float* [[TMP60]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP61]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -4
+; VF-TWO-CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP63]], i32 -3
+; VF-TWO-CHECK-NEXT:    [[TMP65:%.*]] = bitcast float* [[TMP64]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP65]], align 4
 ; VF-TWO-CHECK-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT:    [[TMP62:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -8
-; VF-TWO-CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP62]], i32 -3
-; VF-TWO-CHECK-NEXT:    [[TMP64:%.*]] = bitcast float* [[TMP63]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP64]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -8
+; VF-TWO-CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds float, float* [[TMP66]], i32 -3
+; VF-TWO-CHECK-NEXT:    [[TMP68:%.*]] = bitcast float* [[TMP67]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP68]], align 4
 ; VF-TWO-CHECK-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -12
-; VF-TWO-CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP65]], i32 -3
-; VF-TWO-CHECK-NEXT:    [[TMP67:%.*]] = bitcast float* [[TMP66]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP67]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -12
+; VF-TWO-CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP69]], i32 -3
+; VF-TWO-CHECK-NEXT:    [[TMP71:%.*]] = bitcast float* [[TMP70]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP71]], align 4
 ; VF-TWO-CHECK-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -16
-; VF-TWO-CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP68]], i32 -3
-; VF-TWO-CHECK-NEXT:    [[TMP70:%.*]] = bitcast float* [[TMP69]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP70]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -16
+; VF-TWO-CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 -3
+; VF-TWO-CHECK-NEXT:    [[TMP74:%.*]] = bitcast float* [[TMP73]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP74]], align 4
 ; VF-TWO-CHECK-NEXT:    [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -20
-; VF-TWO-CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP71]], i32 -3
-; VF-TWO-CHECK-NEXT:    [[TMP73:%.*]] = bitcast float* [[TMP72]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP73]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -20
+; VF-TWO-CHECK-NEXT:    [[TMP76:%.*]] = getelementptr inbounds float, float* [[TMP75]], i32 -3
+; VF-TWO-CHECK-NEXT:    [[TMP77:%.*]] = bitcast float* [[TMP76]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP77]], align 4
 ; VF-TWO-CHECK-NEXT:    [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT:    [[TMP74:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -24
-; VF-TWO-CHECK-NEXT:    [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP74]], i32 -3
-; VF-TWO-CHECK-NEXT:    [[TMP76:%.*]] = bitcast float* [[TMP75]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP76]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -24
+; VF-TWO-CHECK-NEXT:    [[TMP79:%.*]] = getelementptr inbounds float, float* [[TMP78]], i32 -3
+; VF-TWO-CHECK-NEXT:    [[TMP80:%.*]] = bitcast float* [[TMP79]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP80]], align 4
 ; VF-TWO-CHECK-NEXT:    [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT:    [[TMP77:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -28
-; VF-TWO-CHECK-NEXT:    [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP77]], i32 -3
-; VF-TWO-CHECK-NEXT:    [[TMP79:%.*]] = bitcast float* [[TMP78]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP79]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP81:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -28
+; VF-TWO-CHECK-NEXT:    [[TMP82:%.*]] = getelementptr inbounds float, float* [[TMP81]], i32 -3
+; VF-TWO-CHECK-NEXT:    [[TMP83:%.*]] = bitcast float* [[TMP82]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP83]], align 4
 ; VF-TWO-CHECK-NEXT:    [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT:    [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT:    [[TMP81:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT:    [[TMP82:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT:    [[TMP83:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT:    [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT:    [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT:    [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT:    [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT:    [[TMP88:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP16]]
-; VF-TWO-CHECK-NEXT:    [[TMP89:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
-; VF-TWO-CHECK-NEXT:    [[TMP90:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
-; VF-TWO-CHECK-NEXT:    [[TMP91:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; VF-TWO-CHECK-NEXT:    [[TMP92:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP20]]
-; VF-TWO-CHECK-NEXT:    [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]]
-; VF-TWO-CHECK-NEXT:    [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP22]]
-; VF-TWO-CHECK-NEXT:    [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; VF-TWO-CHECK-NEXT:    [[TMP96:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 0
-; VF-TWO-CHECK-NEXT:    [[TMP97:%.*]] = bitcast float* [[TMP96]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP80]], <4 x float>* [[TMP97]], align 4
-; VF-TWO-CHECK-NEXT:    [[TMP98:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 4
-; VF-TWO-CHECK-NEXT:    [[TMP99:%.*]] = bitcast float* [[TMP98]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP81]], <4 x float>* [[TMP99]], align 4
-; VF-TWO-CHECK-NEXT:    [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 8
+; VF-TWO-CHECK-NEXT:    [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT:    [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT:    [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT:    [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT:    [[TMP88:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT:    [[TMP89:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT:    [[TMP90:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT:    [[TMP91:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT:    [[TMP92:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP12]]
+; VF-TWO-CHECK-NEXT:    [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP13]]
+; VF-TWO-CHECK-NEXT:    [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
+; VF-TWO-CHECK-NEXT:    [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
+; VF-TWO-CHECK-NEXT:    [[TMP96:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
+; VF-TWO-CHECK-NEXT:    [[TMP97:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
+; VF-TWO-CHECK-NEXT:    [[TMP98:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
+; VF-TWO-CHECK-NEXT:    [[TMP99:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
+; VF-TWO-CHECK-NEXT:    [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 0
 ; VF-TWO-CHECK-NEXT:    [[TMP101:%.*]] = bitcast float* [[TMP100]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP82]], <4 x float>* [[TMP101]], align 4
-; VF-TWO-CHECK-NEXT:    [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 12
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP84]], <4 x float>* [[TMP101]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 4
 ; VF-TWO-CHECK-NEXT:    [[TMP103:%.*]] = bitcast float* [[TMP102]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP83]], <4 x float>* [[TMP103]], align 4
-; VF-TWO-CHECK-NEXT:    [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 16
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP85]], <4 x float>* [[TMP103]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 8
 ; VF-TWO-CHECK-NEXT:    [[TMP105:%.*]] = bitcast float* [[TMP104]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP84]], <4 x float>* [[TMP105]], align 4
-; VF-TWO-CHECK-NEXT:    [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 20
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP86]], <4 x float>* [[TMP105]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 12
 ; VF-TWO-CHECK-NEXT:    [[TMP107:%.*]] = bitcast float* [[TMP106]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP85]], <4 x float>* [[TMP107]], align 4
-; VF-TWO-CHECK-NEXT:    [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 24
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP87]], <4 x float>* [[TMP107]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 16
 ; VF-TWO-CHECK-NEXT:    [[TMP109:%.*]] = bitcast float* [[TMP108]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP86]], <4 x float>* [[TMP109]], align 4
-; VF-TWO-CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 28
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP88]], <4 x float>* [[TMP109]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 20
 ; VF-TWO-CHECK-NEXT:    [[TMP111:%.*]] = bitcast float* [[TMP110]] to <4 x float>*
-; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP87]], <4 x float>* [[TMP111]], align 4
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP89]], <4 x float>* [[TMP111]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP112:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 24
+; VF-TWO-CHECK-NEXT:    [[TMP113:%.*]] = bitcast float* [[TMP112]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP90]], <4 x float>* [[TMP113]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP114:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 28
+; VF-TWO-CHECK-NEXT:    [[TMP115:%.*]] = bitcast float* [[TMP114]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP91]], <4 x float>* [[TMP115]], align 4
 ; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; VF-TWO-CHECK-NEXT:    [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF-TWO-CHECK-NEXT:    br i1 [[TMP112]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF-TWO-CHECK-NEXT:    [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF-TWO-CHECK-NEXT:    br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; VF-TWO-CHECK:       middle.block:
 ; VF-TWO-CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; VF-TWO-CHECK:       vec.epilog.iter.check:
-; VF-TWO-CHECK-NEXT:    [[IND_END19:%.*]] = trunc i64 [[N_VEC]] to i32
+; VF-TWO-CHECK-NEXT:    [[IND_END21:%.*]] = trunc i64 [[N_VEC]] to i32
 ; VF-TWO-CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
 ; VF-TWO-CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -703,43 +702,43 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-TWO-CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC17]] to i32
 ; VF-TWO-CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-TWO-CHECK:       vec.epilog.vector.body:
-; VF-TWO-CHECK-NEXT:    [[OFFSET_IDX23:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-TWO-CHECK-NEXT:    [[OFFSET_IDX22:%.*]] = trunc i64 [[OFFSET_IDX23]] to i32
-; VF-TWO-CHECK-NEXT:    [[TMP113:%.*]] = add i32 [[OFFSET_IDX22]], 0
-; VF-TWO-CHECK-NEXT:    [[TMP114:%.*]] = add i64 [[OFFSET_IDX23]], 0
-; VF-TWO-CHECK-NEXT:    [[TMP115:%.*]] = xor i32 [[TMP113]], -1
-; VF-TWO-CHECK-NEXT:    [[TMP116:%.*]] = add i32 [[TMP115]], [[N]]
-; VF-TWO-CHECK-NEXT:    [[TMP117:%.*]] = sext i32 [[TMP116]] to i64
-; VF-TWO-CHECK-NEXT:    [[TMP118:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP117]]
-; VF-TWO-CHECK-NEXT:    [[TMP119:%.*]] = getelementptr inbounds float, float* [[TMP118]], i32 0
-; VF-TWO-CHECK-NEXT:    [[TMP120:%.*]] = getelementptr inbounds float, float* [[TMP119]], i32 -1
-; VF-TWO-CHECK-NEXT:    [[TMP121:%.*]] = bitcast float* [[TMP120]] to <2 x float>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD24:%.*]] = load <2 x float>, <2 x float>* [[TMP121]], align 4
-; VF-TWO-CHECK-NEXT:    [[REVERSE25:%.*]] = shufflevector <2 x float> [[WIDE_LOAD24]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; VF-TWO-CHECK-NEXT:    [[TMP122:%.*]] = fadd fast <2 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT:    [[TMP123:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP114]]
-; VF-TWO-CHECK-NEXT:    [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 0
+; VF-TWO-CHECK-NEXT:    [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT:    [[OFFSET_IDX23:%.*]] = trunc i64 [[INDEX18]] to i32
+; VF-TWO-CHECK-NEXT:    [[TMP118:%.*]] = add i32 [[OFFSET_IDX23]], 0
+; VF-TWO-CHECK-NEXT:    [[TMP117:%.*]] = add i64 [[INDEX18]], 0
+; VF-TWO-CHECK-NEXT:    [[TMP119:%.*]] = xor i32 [[TMP118]], -1
+; VF-TWO-CHECK-NEXT:    [[TMP120:%.*]] = add i32 [[TMP119]], [[N]]
+; VF-TWO-CHECK-NEXT:    [[TMP121:%.*]] = sext i32 [[TMP120]] to i64
+; VF-TWO-CHECK-NEXT:    [[TMP122:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP121]]
+; VF-TWO-CHECK-NEXT:    [[TMP123:%.*]] = getelementptr inbounds float, float* [[TMP122]], i32 0
+; VF-TWO-CHECK-NEXT:    [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 -1
 ; VF-TWO-CHECK-NEXT:    [[TMP125:%.*]] = bitcast float* [[TMP124]] to <2 x float>*
-; VF-TWO-CHECK-NEXT:    store <2 x float> [[TMP122]], <2 x float>* [[TMP125]], align 4
-; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT26]] = add nuw i64 [[OFFSET_IDX23]], 2
-; VF-TWO-CHECK-NEXT:    [[TMP126:%.*]] = icmp eq i64 [[INDEX_NEXT26]], [[N_VEC17]]
-; VF-TWO-CHECK-NEXT:    br i1 [[TMP126]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD24:%.*]] = load <2 x float>, <2 x float>* [[TMP125]], align 4
+; VF-TWO-CHECK-NEXT:    [[REVERSE25:%.*]] = shufflevector <2 x float> [[WIDE_LOAD24]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; VF-TWO-CHECK-NEXT:    [[TMP126:%.*]] = fadd fast <2 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT:    [[TMP127:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP117]]
+; VF-TWO-CHECK-NEXT:    [[TMP128:%.*]] = getelementptr inbounds float, float* [[TMP127]], i32 0
+; VF-TWO-CHECK-NEXT:    [[TMP129:%.*]] = bitcast float* [[TMP128]] to <2 x float>*
+; VF-TWO-CHECK-NEXT:    store <2 x float> [[TMP126]], <2 x float>* [[TMP129]], align 4
+; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT19]] = add nuw i64 [[INDEX18]], 2
+; VF-TWO-CHECK-NEXT:    [[TMP130:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC17]]
+; VF-TWO-CHECK-NEXT:    br i1 [[TMP130]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF-TWO-CHECK:       vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT:    [[CMP_N20:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]]
-; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-TWO-CHECK-NEXT:    [[CMP_N22:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]]
+; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N22]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-TWO-CHECK:       vec.epilog.scalar.ph:
 ; VF-TWO-CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
-; VF-TWO-CHECK-NEXT:    [[BC_RESUME_VAL18:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; VF-TWO-CHECK-NEXT:    [[BC_RESUME_VAL20:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; VF-TWO-CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; VF-TWO-CHECK:       for.body:
 ; VF-TWO-CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; VF-TWO-CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL18]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; VF-TWO-CHECK-NEXT:    [[TMP127:%.*]] = xor i32 [[I_014]], -1
-; VF-TWO-CHECK-NEXT:    [[SUB2:%.*]] = add i32 [[TMP127]], [[N]]
+; VF-TWO-CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL20]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; VF-TWO-CHECK-NEXT:    [[TMP131:%.*]] = xor i32 [[I_014]], -1
+; VF-TWO-CHECK-NEXT:    [[SUB2:%.*]] = add i32 [[TMP131]], [[N]]
 ; VF-TWO-CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64
 ; VF-TWO-CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]]
-; VF-TWO-CHECK-NEXT:    [[TMP128:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; VF-TWO-CHECK-NEXT:    [[CONV3:%.*]] = fadd fast float [[TMP128]], 1.000000e+00
+; VF-TWO-CHECK-NEXT:    [[TMP132:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; VF-TWO-CHECK-NEXT:    [[CONV3:%.*]] = fadd fast float [[TMP132]], 1.000000e+00
 ; VF-TWO-CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
 ; VF-TWO-CHECK-NEXT:    store float [[CONV3]], float* [[ARRAYIDX5]], align 4
 ; VF-TWO-CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -768,12 +767,12 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-FOUR-CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]])
 ; VF-FOUR-CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
 ; VF-FOUR-CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; VF-FOUR-CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
-; VF-FOUR-CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[TMP0]]
-; VF-FOUR-CHECK-NEXT:    [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
-; VF-FOUR-CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
-; VF-FOUR-CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; VF-FOUR-CHECK-NEXT:    br i1 [[TMP7]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; VF-FOUR-CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
+; VF-FOUR-CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]]
+; VF-FOUR-CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
+; VF-FOUR-CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; VF-FOUR-CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP8]]
+; VF-FOUR-CHECK-NEXT:    br i1 [[TMP10]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; VF-FOUR-CHECK:       vector.main.loop.iter.check:
 ; VF-FOUR-CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
 ; VF-FOUR-CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -784,142 +783,142 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-FOUR-CHECK:       vector.body:
 ; VF-FOUR-CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF-FOUR-CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; VF-FOUR-CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
-; VF-FOUR-CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 4
-; VF-FOUR-CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], 8
-; VF-FOUR-CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[OFFSET_IDX]], 12
-; VF-FOUR-CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[OFFSET_IDX]], 16
-; VF-FOUR-CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[OFFSET_IDX]], 20
-; VF-FOUR-CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 24
-; VF-FOUR-CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 28
-; VF-FOUR-CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 0
-; VF-FOUR-CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 4
-; VF-FOUR-CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 8
-; VF-FOUR-CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 12
-; VF-FOUR-CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 16
-; VF-FOUR-CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 20
-; VF-FOUR-CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[INDEX]], 24
-; VF-FOUR-CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 28
-; VF-FOUR-CHECK-NEXT:    [[TMP24:%.*]] = xor i32 [[TMP8]], -1
-; VF-FOUR-CHECK-NEXT:    [[TMP25:%.*]] = xor i32 [[TMP9]], -1
-; VF-FOUR-CHECK-NEXT:    [[TMP26:%.*]] = xor i32 [[TMP10]], -1
-; VF-FOUR-CHECK-NEXT:    [[TMP27:%.*]] = xor i32 [[TMP11]], -1
-; VF-FOUR-CHECK-NEXT:    [[TMP28:%.*]] = xor i32 [[TMP12]], -1
-; VF-FOUR-CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP13]], -1
-; VF-FOUR-CHECK-NEXT:    [[TMP30:%.*]] = xor i32 [[TMP14]], -1
-; VF-FOUR-CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP15]], -1
-; VF-FOUR-CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP24]], [[N]]
-; VF-FOUR-CHECK-NEXT:    [[TMP33:%.*]] = add i32 [[TMP25]], [[N]]
-; VF-FOUR-CHECK-NEXT:    [[TMP34:%.*]] = add i32 [[TMP26]], [[N]]
-; VF-FOUR-CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP27]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 0
+; VF-FOUR-CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 4
+; VF-FOUR-CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 8
+; VF-FOUR-CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[OFFSET_IDX]], 12
+; VF-FOUR-CHECK-NEXT:    [[TMP24:%.*]] = add i32 [[OFFSET_IDX]], 16
+; VF-FOUR-CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 20
+; VF-FOUR-CHECK-NEXT:    [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 24
+; VF-FOUR-CHECK-NEXT:    [[TMP27:%.*]] = add i32 [[OFFSET_IDX]], 28
+; VF-FOUR-CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; VF-FOUR-CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 4
+; VF-FOUR-CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 8
+; VF-FOUR-CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 12
+; VF-FOUR-CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 16
+; VF-FOUR-CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 20
+; VF-FOUR-CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 24
+; VF-FOUR-CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 28
+; VF-FOUR-CHECK-NEXT:    [[TMP28:%.*]] = xor i32 [[TMP20]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP21]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP30:%.*]] = xor i32 [[TMP22]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP23]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP32:%.*]] = xor i32 [[TMP24]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP33:%.*]] = xor i32 [[TMP25]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP26]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP35:%.*]] = xor i32 [[TMP27]], -1
 ; VF-FOUR-CHECK-NEXT:    [[TMP36:%.*]] = add i32 [[TMP28]], [[N]]
 ; VF-FOUR-CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP29]], [[N]]
 ; VF-FOUR-CHECK-NEXT:    [[TMP38:%.*]] = add i32 [[TMP30]], [[N]]
 ; VF-FOUR-CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP31]], [[N]]
-; VF-FOUR-CHECK-NEXT:    [[TMP40:%.*]] = sext i32 [[TMP32]] to i64
-; VF-FOUR-CHECK-NEXT:    [[TMP41:%.*]] = sext i32 [[TMP33]] to i64
-; VF-FOUR-CHECK-NEXT:    [[TMP42:%.*]] = sext i32 [[TMP34]] to i64
-; VF-FOUR-CHECK-NEXT:    [[TMP43:%.*]] = sext i32 [[TMP35]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP40:%.*]] = add i32 [[TMP32]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP41:%.*]] = add i32 [[TMP33]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP42:%.*]] = add i32 [[TMP34]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP35]], [[N]]
 ; VF-FOUR-CHECK-NEXT:    [[TMP44:%.*]] = sext i32 [[TMP36]] to i64
 ; VF-FOUR-CHECK-NEXT:    [[TMP45:%.*]] = sext i32 [[TMP37]] to i64
 ; VF-FOUR-CHECK-NEXT:    [[TMP46:%.*]] = sext i32 [[TMP38]] to i64
 ; VF-FOUR-CHECK-NEXT:    [[TMP47:%.*]] = sext i32 [[TMP39]] to i64
-; VF-FOUR-CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP40]]
-; VF-FOUR-CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP41]]
-; VF-FOUR-CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP42]]
-; VF-FOUR-CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP43]]
-; VF-FOUR-CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP44]]
+; VF-FOUR-CHECK-NEXT:    [[TMP48:%.*]] = sext i32 [[TMP40]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP49:%.*]] = sext i32 [[TMP41]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP50:%.*]] = sext i32 [[TMP42]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP51:%.*]] = sext i32 [[TMP43]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP44]]
 ; VF-FOUR-CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP45]]
 ; VF-FOUR-CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP46]]
 ; VF-FOUR-CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP47]]
-; VF-FOUR-CHECK-NEXT:    [[TMP56:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 0
-; VF-FOUR-CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -3
-; VF-FOUR-CHECK-NEXT:    [[TMP58:%.*]] = bitcast float* [[TMP57]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP58]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP56:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP48]]
+; VF-FOUR-CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP49]]
+; VF-FOUR-CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP50]]
+; VF-FOUR-CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP51]]
+; VF-FOUR-CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 0
+; VF-FOUR-CHECK-NEXT:    [[TMP61:%.*]] = getelementptr inbounds float, float* [[TMP60]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP62:%.*]] = bitcast float* [[TMP61]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP62]], align 4
 ; VF-FOUR-CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -4
-; VF-FOUR-CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP59]], i32 -3
-; VF-FOUR-CHECK-NEXT:    [[TMP61:%.*]] = bitcast float* [[TMP60]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP61]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -4
+; VF-FOUR-CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP63]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP65:%.*]] = bitcast float* [[TMP64]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP65]], align 4
 ; VF-FOUR-CHECK-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT:    [[TMP62:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -8
-; VF-FOUR-CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP62]], i32 -3
-; VF-FOUR-CHECK-NEXT:    [[TMP64:%.*]] = bitcast float* [[TMP63]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP64]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -8
+; VF-FOUR-CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds float, float* [[TMP66]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP68:%.*]] = bitcast float* [[TMP67]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP68]], align 4
 ; VF-FOUR-CHECK-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -12
-; VF-FOUR-CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP65]], i32 -3
-; VF-FOUR-CHECK-NEXT:    [[TMP67:%.*]] = bitcast float* [[TMP66]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP67]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -12
+; VF-FOUR-CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP69]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP71:%.*]] = bitcast float* [[TMP70]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP71]], align 4
 ; VF-FOUR-CHECK-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -16
-; VF-FOUR-CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP68]], i32 -3
-; VF-FOUR-CHECK-NEXT:    [[TMP70:%.*]] = bitcast float* [[TMP69]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP70]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -16
+; VF-FOUR-CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP74:%.*]] = bitcast float* [[TMP73]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP74]], align 4
 ; VF-FOUR-CHECK-NEXT:    [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -20
-; VF-FOUR-CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP71]], i32 -3
-; VF-FOUR-CHECK-NEXT:    [[TMP73:%.*]] = bitcast float* [[TMP72]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP73]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -20
+; VF-FOUR-CHECK-NEXT:    [[TMP76:%.*]] = getelementptr inbounds float, float* [[TMP75]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP77:%.*]] = bitcast float* [[TMP76]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP77]], align 4
 ; VF-FOUR-CHECK-NEXT:    [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT:    [[TMP74:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -24
-; VF-FOUR-CHECK-NEXT:    [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP74]], i32 -3
-; VF-FOUR-CHECK-NEXT:    [[TMP76:%.*]] = bitcast float* [[TMP75]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP76]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -24
+; VF-FOUR-CHECK-NEXT:    [[TMP79:%.*]] = getelementptr inbounds float, float* [[TMP78]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP80:%.*]] = bitcast float* [[TMP79]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP80]], align 4
 ; VF-FOUR-CHECK-NEXT:    [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT:    [[TMP77:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -28
-; VF-FOUR-CHECK-NEXT:    [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP77]], i32 -3
-; VF-FOUR-CHECK-NEXT:    [[TMP79:%.*]] = bitcast float* [[TMP78]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP79]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP81:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -28
+; VF-FOUR-CHECK-NEXT:    [[TMP82:%.*]] = getelementptr inbounds float, float* [[TMP81]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP83:%.*]] = bitcast float* [[TMP82]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP83]], align 4
 ; VF-FOUR-CHECK-NEXT:    [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT:    [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT:    [[TMP81:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT:    [[TMP82:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT:    [[TMP83:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT:    [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT:    [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT:    [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT:    [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT:    [[TMP88:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP16]]
-; VF-FOUR-CHECK-NEXT:    [[TMP89:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
-; VF-FOUR-CHECK-NEXT:    [[TMP90:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
-; VF-FOUR-CHECK-NEXT:    [[TMP91:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; VF-FOUR-CHECK-NEXT:    [[TMP92:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP20]]
-; VF-FOUR-CHECK-NEXT:    [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]]
-; VF-FOUR-CHECK-NEXT:    [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP22]]
-; VF-FOUR-CHECK-NEXT:    [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; VF-FOUR-CHECK-NEXT:    [[TMP96:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 0
-; VF-FOUR-CHECK-NEXT:    [[TMP97:%.*]] = bitcast float* [[TMP96]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP80]], <4 x float>* [[TMP97]], align 4
-; VF-FOUR-CHECK-NEXT:    [[TMP98:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 4
-; VF-FOUR-CHECK-NEXT:    [[TMP99:%.*]] = bitcast float* [[TMP98]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP81]], <4 x float>* [[TMP99]], align 4
-; VF-FOUR-CHECK-NEXT:    [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 8
+; VF-FOUR-CHECK-NEXT:    [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP88:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP89:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP90:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP91:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP92:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP12]]
+; VF-FOUR-CHECK-NEXT:    [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP13]]
+; VF-FOUR-CHECK-NEXT:    [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
+; VF-FOUR-CHECK-NEXT:    [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
+; VF-FOUR-CHECK-NEXT:    [[TMP96:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
+; VF-FOUR-CHECK-NEXT:    [[TMP97:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
+; VF-FOUR-CHECK-NEXT:    [[TMP98:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
+; VF-FOUR-CHECK-NEXT:    [[TMP99:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
+; VF-FOUR-CHECK-NEXT:    [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 0
 ; VF-FOUR-CHECK-NEXT:    [[TMP101:%.*]] = bitcast float* [[TMP100]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP82]], <4 x float>* [[TMP101]], align 4
-; VF-FOUR-CHECK-NEXT:    [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 12
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP84]], <4 x float>* [[TMP101]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 4
 ; VF-FOUR-CHECK-NEXT:    [[TMP103:%.*]] = bitcast float* [[TMP102]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP83]], <4 x float>* [[TMP103]], align 4
-; VF-FOUR-CHECK-NEXT:    [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 16
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP85]], <4 x float>* [[TMP103]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 8
 ; VF-FOUR-CHECK-NEXT:    [[TMP105:%.*]] = bitcast float* [[TMP104]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP84]], <4 x float>* [[TMP105]], align 4
-; VF-FOUR-CHECK-NEXT:    [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 20
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP86]], <4 x float>* [[TMP105]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 12
 ; VF-FOUR-CHECK-NEXT:    [[TMP107:%.*]] = bitcast float* [[TMP106]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP85]], <4 x float>* [[TMP107]], align 4
-; VF-FOUR-CHECK-NEXT:    [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 24
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP87]], <4 x float>* [[TMP107]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 16
 ; VF-FOUR-CHECK-NEXT:    [[TMP109:%.*]] = bitcast float* [[TMP108]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP86]], <4 x float>* [[TMP109]], align 4
-; VF-FOUR-CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 28
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP88]], <4 x float>* [[TMP109]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 20
 ; VF-FOUR-CHECK-NEXT:    [[TMP111:%.*]] = bitcast float* [[TMP110]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP87]], <4 x float>* [[TMP111]], align 4
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP89]], <4 x float>* [[TMP111]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP112:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 24
+; VF-FOUR-CHECK-NEXT:    [[TMP113:%.*]] = bitcast float* [[TMP112]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP90]], <4 x float>* [[TMP113]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP114:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 28
+; VF-FOUR-CHECK-NEXT:    [[TMP115:%.*]] = bitcast float* [[TMP114]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP91]], <4 x float>* [[TMP115]], align 4
 ; VF-FOUR-CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; VF-FOUR-CHECK-NEXT:    [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF-FOUR-CHECK-NEXT:    br i1 [[TMP112]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF-FOUR-CHECK-NEXT:    [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF-FOUR-CHECK-NEXT:    br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV_CM:![0-9]+]]
 ; VF-FOUR-CHECK:       middle.block:
 ; VF-FOUR-CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-FOUR-CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; VF-FOUR-CHECK:       vec.epilog.iter.check:
-; VF-FOUR-CHECK-NEXT:    [[IND_END19:%.*]] = trunc i64 [[N_VEC]] to i32
+; VF-FOUR-CHECK-NEXT:    [[IND_END21:%.*]] = trunc i64 [[N_VEC]] to i32
 ; VF-FOUR-CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-FOUR-CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; VF-FOUR-CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -930,49 +929,49 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-FOUR-CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC17]] to i32
 ; VF-FOUR-CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-FOUR-CHECK:       vec.epilog.vector.body:
-; VF-FOUR-CHECK-NEXT:    [[OFFSET_IDX23:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-FOUR-CHECK-NEXT:    [[OFFSET_IDX22:%.*]] = trunc i64 [[OFFSET_IDX23]] to i32
-; VF-FOUR-CHECK-NEXT:    [[TMP113:%.*]] = add i32 [[OFFSET_IDX22]], 0
-; VF-FOUR-CHECK-NEXT:    [[TMP114:%.*]] = add i64 [[OFFSET_IDX23]], 0
-; VF-FOUR-CHECK-NEXT:    [[TMP115:%.*]] = xor i32 [[TMP113]], -1
-; VF-FOUR-CHECK-NEXT:    [[TMP116:%.*]] = add i32 [[TMP115]], [[N]]
-; VF-FOUR-CHECK-NEXT:    [[TMP117:%.*]] = sext i32 [[TMP116]] to i64
-; VF-FOUR-CHECK-NEXT:    [[TMP118:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP117]]
-; VF-FOUR-CHECK-NEXT:    [[TMP119:%.*]] = getelementptr inbounds float, float* [[TMP118]], i32 0
-; VF-FOUR-CHECK-NEXT:    [[TMP120:%.*]] = getelementptr inbounds float, float* [[TMP119]], i32 -3
-; VF-FOUR-CHECK-NEXT:    [[TMP121:%.*]] = bitcast float* [[TMP120]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD24:%.*]] = load <4 x float>, <4 x float>* [[TMP121]], align 4
-; VF-FOUR-CHECK-NEXT:    [[REVERSE25:%.*]] = shufflevector <4 x float> [[WIDE_LOAD24]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT:    [[TMP122:%.*]] = fadd fast <4 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT:    [[TMP123:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP114]]
-; VF-FOUR-CHECK-NEXT:    [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 0
+; VF-FOUR-CHECK-NEXT:    [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-FOUR-CHECK-NEXT:    [[OFFSET_IDX23:%.*]] = trunc i64 [[INDEX18]] to i32
+; VF-FOUR-CHECK-NEXT:    [[TMP118:%.*]] = add i32 [[OFFSET_IDX23]], 0
+; VF-FOUR-CHECK-NEXT:    [[TMP117:%.*]] = add i64 [[INDEX18]], 0
+; VF-FOUR-CHECK-NEXT:    [[TMP119:%.*]] = xor i32 [[TMP118]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP120:%.*]] = add i32 [[TMP119]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP121:%.*]] = sext i32 [[TMP120]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP122:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP121]]
+; VF-FOUR-CHECK-NEXT:    [[TMP123:%.*]] = getelementptr inbounds float, float* [[TMP122]], i32 0
+; VF-FOUR-CHECK-NEXT:    [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 -3
 ; VF-FOUR-CHECK-NEXT:    [[TMP125:%.*]] = bitcast float* [[TMP124]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP122]], <4 x float>* [[TMP125]], align 4
-; VF-FOUR-CHECK-NEXT:    [[INDEX_NEXT26]] = add nuw i64 [[OFFSET_IDX23]], 4
-; VF-FOUR-CHECK-NEXT:    [[TMP126:%.*]] = icmp eq i64 [[INDEX_NEXT26]], [[N_VEC17]]
-; VF-FOUR-CHECK-NEXT:    br i1 [[TMP126]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD24:%.*]] = load <4 x float>, <4 x float>* [[TMP125]], align 4
+; VF-FOUR-CHECK-NEXT:    [[REVERSE25:%.*]] = shufflevector <4 x float> [[WIDE_LOAD24]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT:    [[TMP126:%.*]] = fadd fast <4 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP127:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP117]]
+; VF-FOUR-CHECK-NEXT:    [[TMP128:%.*]] = getelementptr inbounds float, float* [[TMP127]], i32 0
+; VF-FOUR-CHECK-NEXT:    [[TMP129:%.*]] = bitcast float* [[TMP128]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP126]], <4 x float>* [[TMP129]], align 4
+; VF-FOUR-CHECK-NEXT:    [[INDEX_NEXT19]] = add nuw i64 [[INDEX18]], 4
+; VF-FOUR-CHECK-NEXT:    [[TMP130:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC17]]
+; VF-FOUR-CHECK-NEXT:    br i1 [[TMP130]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV_CM:![0-9]+]]
 ; VF-FOUR-CHECK:       vec.epilog.middle.block:
-; VF-FOUR-CHECK-NEXT:    [[CMP_N20:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]]
-; VF-FOUR-CHECK-NEXT:    br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-FOUR-CHECK-NEXT:    [[CMP_N22:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]]
+; VF-FOUR-CHECK-NEXT:    br i1 [[CMP_N22]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-FOUR-CHECK:       vec.epilog.scalar.ph:
 ; VF-FOUR-CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
-; VF-FOUR-CHECK-NEXT:    [[BC_RESUME_VAL18:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; VF-FOUR-CHECK-NEXT:    [[BC_RESUME_VAL20:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; VF-FOUR-CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; VF-FOUR-CHECK:       for.body:
 ; VF-FOUR-CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; VF-FOUR-CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL18]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; VF-FOUR-CHECK-NEXT:    [[TMP127:%.*]] = xor i32 [[I_014]], -1
-; VF-FOUR-CHECK-NEXT:    [[SUB2:%.*]] = add i32 [[TMP127]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL20]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; VF-FOUR-CHECK-NEXT:    [[TMP131:%.*]] = xor i32 [[I_014]], -1
+; VF-FOUR-CHECK-NEXT:    [[SUB2:%.*]] = add i32 [[TMP131]], [[N]]
 ; VF-FOUR-CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64
 ; VF-FOUR-CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]]
-; VF-FOUR-CHECK-NEXT:    [[TMP128:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; VF-FOUR-CHECK-NEXT:    [[CONV3:%.*]] = fadd fast float [[TMP128]], 1.000000e+00
+; VF-FOUR-CHECK-NEXT:    [[TMP132:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; VF-FOUR-CHECK-NEXT:    [[CONV3:%.*]] = fadd fast float [[TMP132]], 1.000000e+00
 ; VF-FOUR-CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
 ; VF-FOUR-CHECK-NEXT:    store float [[CONV3]], float* [[ARRAYIDX5]], align 4
 ; VF-FOUR-CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; VF-FOUR-CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_014]], 1
 ; VF-FOUR-CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; VF-FOUR-CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF-FOUR-CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOPID_MS_CM:![0-9]+]]
 ; VF-FOUR-CHECK:       for.end.loopexit.loopexit:
 ; VF-FOUR-CHECK-NEXT:    br label [[FOR_END_LOOPEXIT]]
 ; VF-FOUR-CHECK:       for.end.loopexit:
@@ -1012,6 +1011,14 @@ for.end:                                          ; preds = %for.end.loopexit, %
   ret i32 0
 }
 
+; VF-TWO-CHECK-DAG: [[LOOPID_MV]] = distinct !{[[LOOPID_MV]], [[LOOPID_DISABLE_VECT:!.*]]}
+; VF-TWO-CHECK-DAG: [[LOOPID_EV]] = distinct !{[[LOOPID_EV]], [[LOOPID_DISABLE_VECT]], [[LOOPID_DISABLE_UNROLL:!.*]]}
+; VF-TWO-CHECK-DAG: [[LOOPID_DISABLE_VECT]] = [[DISABLE_VECT_STR:!{!"llvm.loop.isvectorized".*}.*]]
+; VF-TWO-CHECK-DAG: [[LOOPID_DISABLE_UNROLL]] = [[DISABLE_UNROLL_STR:!{!"llvm.loop.unroll.runtime.disable"}.*]]
 ;
+; VF-FOUR-CHECK-DAG: [[LOOPID_MV_CM]] = distinct !{[[LOOPID_MV_CM]], [[LOOPID_DISABLE_VECT_CM:!.*]]}
+; VF-FOUR-CHECK-DAG: [[LOOPID_EV_CM]] = distinct !{[[LOOPID_EV_CM]], [[LOOPID_DISABLE_VECT_CM]], [[LOOPID_DISABLE_UNROLL_CM:!.*]]}
+; VF-FOUR-CHECK-DAG: [[LOOPID_DISABLE_VECT_CM]] = [[DISABLE_VECT_STR_CM:!{!"llvm.loop.isvectorized".*}.*]]
+; VF-FOUR-CHECK-DAG: [[LOOPID_DISABLE_UNROLL_CM]] = [[DISABLE_UNROLL_STR_CM:!{!"llvm.loop.unroll.runtime.disable"}.*]]
 ;
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-spe" "unsafe-fp-math"="true" "use-soft-float"="false" }

diff  --git a/llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll
index 27e42a0bff0ef..d3cdabd26f503 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll
@@ -1,67 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -mcpu=pwr8 -mattr=+vsx -force-vector-interleave=1 -vectorizer-maximize-bandwidth=0 -S | FileCheck %s
 
 target triple = "powerpc64-unknown-linux-gnu"
 
 define signext i32 @foo(i8* readonly %ptr, i32 signext %l) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[PTR1:%.*]] = ptrtoint i8* [[PTR:%.*]] to i64
-; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[L:%.*]] to i64
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[L]], 0
-; CHECK-NEXT:    br i1 [[CMP7]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
-; CHECK:       while.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[PTR1]], [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[PTR1]], 1
-; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 [[TMP1]])
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[UMAX]], [[PTR1]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[N_VEC]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], <i8 -64, i8 -64, i8 -64, i8 -64>
-; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i1> [[TMP6]] to <4 x i32>
-; CHECK-NEXT:    [[TMP8]] = add <4 x i32> [[TMP7]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[WHILE_BODY_PREHEADER]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[COUNT_09:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[PTR_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[PTR_ADDR_08]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 [[TMP11]], -64
-; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[CMP1]] to i32
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[COND]], [[COUNT_09]]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PTR_ADDR_08]], i64 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8* [[INCDEC_PTR]], [[ADD_PTR]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_END]]
-; CHECK:       while.end:
-; CHECK-NEXT:    [[COUNT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT:    ret i32 [[COUNT_0_LCSSA]]
-;
 entry:
   %idx.ext = sext i32 %l to i64
   %add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
@@ -90,70 +31,14 @@ while.end:                                        ; preds = %while.end.loopexit,
   %count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
   ret i32 %count.0.lcssa
 
+; CHECK: load <4 x i8>
+; CHECK: icmp slt <4 x i8>
 }
 
 
 define signext i16 @foo2(i8* readonly %ptr, i32 signext %l) {
-; CHECK-LABEL: @foo2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[PTR1:%.*]] = ptrtoint i8* [[PTR:%.*]] to i64
-; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[L:%.*]] to i64
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[L]], 0
-; CHECK-NEXT:    br i1 [[CMP7]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
-; CHECK:       while.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[PTR1]], [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[PTR1]], 1
-; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 [[TMP1]])
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[UMAX]], [[PTR1]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[N_VEC]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i8> [[WIDE_LOAD]], <i8 -64, i8 -64, i8 -64, i8 -64, i8 -64, i8 -64, i8 -64, i8 -64>
-; CHECK-NEXT:    [[TMP7:%.*]] = zext <8 x i1> [[TMP6]] to <8 x i16>
-; CHECK-NEXT:    [[TMP8]] = add <8 x i16> [[TMP7]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP8]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[WHILE_BODY_PREHEADER]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[COUNT_09:%.*]] = phi i16 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[PTR_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[PTR_ADDR_08]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 [[TMP11]], -64
-; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[CMP1]] to i16
-; CHECK-NEXT:    [[ADD]] = add nsw i16 [[COND]], [[COUNT_09]]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PTR_ADDR_08]], i64 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8* [[INCDEC_PTR]], [[ADD_PTR]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i16 [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_END]]
-; CHECK:       while.end:
-; CHECK-NEXT:    [[COUNT_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT:    ret i16 [[COUNT_0_LCSSA]]
-;
 entry:
-  %idx.ext = sext i32 %l to i64
+  %idx.ext = sext i32 %l to i64 
   %add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
   %cmp7 = icmp sgt i32 %l, 0
   br i1 %cmp7, label %while.body.preheader, label %while.end
@@ -165,8 +50,8 @@ while.body:                                       ; preds = %while.body.preheade
   %count.09 = phi i16 [ %add, %while.body ], [ 0, %while.body.preheader ]
   %ptr.addr.08 = phi i8* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
   %0 = load i8, i8* %ptr.addr.08, align 1
-  %cmp1 = icmp slt i8 %0, -64
-  %cond = zext i1 %cmp1 to i16
+  %cmp1 = icmp slt i8 %0, -64 
+  %cond = zext i1 %cmp1 to i16 
   %add = add nsw i16 %cond, %count.09
   %incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.08, i64 1
   %cmp = icmp ult i8* %incdec.ptr, %add.ptr
@@ -180,73 +65,14 @@ while.end:                                        ; preds = %while.end.loopexit,
   %count.0.lcssa = phi i16 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
   ret i16 %count.0.lcssa
 
+; CHECK-LABEL: foo2
+; CHECK: load <8 x i8>
+; CHECK: icmp slt <8 x i8>
 }
 
 define signext i32 @foo3(i16* readonly %ptr, i32 signext %l) {
-; CHECK-LABEL: @foo3(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[PTR1:%.*]] = ptrtoint i16* [[PTR:%.*]] to i64
-; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[L:%.*]] to i64
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[L]], 0
-; CHECK-NEXT:    br i1 [[CMP7]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
-; CHECK:       while.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[PTR1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[PTR1]], 2
-; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 [[TMP2]])
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[UMAX]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], [[PTR1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP4]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add nuw i64 [[TMP5]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP6]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i16, i16* [[PTR]], i64 [[N_VEC]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PTR]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i16* [[TMP8]] to <4 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP9]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp slt <4 x i16> [[WIDE_LOAD]], <i16 -64, i16 -64, i16 -64, i16 -64>
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <4 x i1> [[TMP10]] to <4 x i32>
-; CHECK-NEXT:    [[TMP12]] = add <4 x i32> [[TMP11]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[WHILE_BODY_PREHEADER]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[COUNT_09:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[PTR_ADDR_16:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i16, i16* [[PTR_ADDR_16]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i16 [[TMP15]], -64
-; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[CMP1]] to i32
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[COND]], [[COUNT_09]]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PTR_ADDR_16]], i64 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i16* [[INCDEC_PTR]], [[ADD_PTR]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_END]]
-; CHECK:       while.end:
-; CHECK-NEXT:    [[COUNT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT:    ret i32 [[COUNT_0_LCSSA]]
-;
 entry:
-  %idx.ext = sext i32 %l to i64
+  %idx.ext = sext i32 %l to i64 
   %add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
   %cmp7 = icmp sgt i32 %l, 0
   br i1 %cmp7, label %while.body.preheader, label %while.end
@@ -258,8 +84,8 @@ while.body:                                       ; preds = %while.body.preheade
   %count.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
   %ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
   %0 = load i16, i16* %ptr.addr.16, align 1
-  %cmp1 = icmp slt i16 %0, -64
-  %cond = zext i1 %cmp1 to i32
+  %cmp1 = icmp slt i16 %0, -64 
+  %cond = zext i1 %cmp1 to i32 
   %add = add nsw i32 %cond, %count.09
   %incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
   %cmp = icmp ult i16* %incdec.ptr, %add.ptr
@@ -273,73 +99,14 @@ while.end:                                        ; preds = %while.end.loopexit,
   %count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
   ret i32 %count.0.lcssa
 
+; CHECK-LABEL: foo3
+; CHECK: load <4 x i16>
+; CHECK: icmp slt <4 x i16>
 }
 
 define i64 @foo4(i16* readonly %ptr, i32 signext %l) {
-; CHECK-LABEL: @foo4(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[PTR1:%.*]] = ptrtoint i16* [[PTR:%.*]] to i64
-; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[L:%.*]] to i64
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[L]], 0
-; CHECK-NEXT:    br i1 [[CMP7]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
-; CHECK:       while.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[PTR1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[PTR1]], 2
-; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 [[TMP2]])
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[UMAX]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], [[PTR1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP4]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add nuw i64 [[TMP5]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP6]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i16, i16* [[PTR]], i64 [[N_VEC]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PTR]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i16* [[TMP8]] to <2 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP9]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp slt <2 x i16> [[WIDE_LOAD]], <i16 -64, i16 -64>
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <2 x i1> [[TMP10]] to <2 x i64>
-; CHECK-NEXT:    [[TMP12]] = add <2 x i64> [[TMP11]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP12]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[WHILE_BODY_PREHEADER]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[COUNT_09:%.*]] = phi i64 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[PTR_ADDR_16:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i16, i16* [[PTR_ADDR_16]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i16 [[TMP15]], -64
-; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[CMP1]] to i64
-; CHECK-NEXT:    [[ADD]] = add nsw i64 [[COND]], [[COUNT_09]]
-; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PTR_ADDR_16]], i64 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i16* [[INCDEC_PTR]], [[ADD_PTR]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_END]]
-; CHECK:       while.end:
-; CHECK-NEXT:    [[COUNT_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT:    ret i64 [[COUNT_0_LCSSA]]
-;
 entry:
-  %idx.ext = sext i32 %l to i64
+  %idx.ext = sext i32 %l to i64 
   %add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
   %cmp7 = icmp sgt i32 %l, 0
   br i1 %cmp7, label %while.body.preheader, label %while.end
@@ -351,8 +118,8 @@ while.body:                                       ; preds = %while.body.preheade
   %count.09 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
   %ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
   %0 = load i16, i16* %ptr.addr.16, align 1
-  %cmp1 = icmp slt i16 %0, -64
-  %cond = zext i1 %cmp1 to i64
+  %cmp1 = icmp slt i16 %0, -64 
+  %cond = zext i1 %cmp1 to i64 
   %add = add nsw i64 %cond, %count.09
   %incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
   %cmp = icmp ult i16* %incdec.ptr, %add.ptr
@@ -366,5 +133,8 @@ while.end:                                        ; preds = %while.end.loopexit,
   %count.0.lcssa = phi i64 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
   ret i64 %count.0.lcssa
 
+; CHECK-LABEL: foo4
+; CHECK: load <2 x i16>
+; CHECK: icmp slt <2 x i16>
 }
 

diff  --git a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
index 0daf184e0bbad..e3285e67af2b7 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -debug-only=loop-vectorize -passes='function(loop-vectorize),default<O2>' -vectorizer-maximize-bandwidth -mtriple=powerpc64-unknown-linux -S -mcpu=pwr8 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR8
 ; RUN: opt < %s -debug-only=loop-vectorize -passes='function(loop-vectorize),default<O2>' -vectorizer-maximize-bandwidth -mtriple=powerpc64le-unknown-linux -S -mcpu=pwr9 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR9
 ; REQUIRES: asserts
@@ -7,186 +6,11 @@
 @b = global [1024 x i8] zeroinitializer, align 16
 
 define i32 @foo() {
-; CHECK-PWR8-LABEL: @foo(
-; CHECK-PWR8-NEXT:  iter.check:
-; CHECK-PWR8-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-PWR8:       vector.body:
-; CHECK-PWR8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP32:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI3:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[INDEX]]
-; CHECK-PWR8-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
-; CHECK-PWR8-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 16
-; CHECK-PWR8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 16
-; CHECK-PWR8-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-PWR8-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 16
-; CHECK-PWR8-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 32
-; CHECK-PWR8-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>*
-; CHECK-PWR8-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 16
-; CHECK-PWR8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 48
-; CHECK-PWR8-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>*
-; CHECK-PWR8-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 16
-; CHECK-PWR8-NEXT:    [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-PWR8-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-PWR8-NEXT:    [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
-; CHECK-PWR8-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
-; CHECK-PWR8-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[INDEX]]
-; CHECK-PWR8-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <16 x i8>*
-; CHECK-PWR8-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP13]], align 16
-; CHECK-PWR8-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i64 16
-; CHECK-PWR8-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <16 x i8>*
-; CHECK-PWR8-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP15]], align 16
-; CHECK-PWR8-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i64 32
-; CHECK-PWR8-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <16 x i8>*
-; CHECK-PWR8-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP17]], align 16
-; CHECK-PWR8-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i64 48
-; CHECK-PWR8-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <16 x i8>*
-; CHECK-PWR8-NEXT:    [[WIDE_LOAD10:%.*]] = load <16 x i8>, <16 x i8>* [[TMP19]], align 16
-; CHECK-PWR8-NEXT:    [[TMP20:%.*]] = zext <16 x i8> [[WIDE_LOAD7]] to <16 x i32>
-; CHECK-PWR8-NEXT:    [[TMP21:%.*]] = zext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
-; CHECK-PWR8-NEXT:    [[TMP22:%.*]] = zext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
-; CHECK-PWR8-NEXT:    [[TMP23:%.*]] = zext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
-; CHECK-PWR8-NEXT:    [[TMP24:%.*]] = sub nsw <16 x i32> [[TMP8]], [[TMP20]]
-; CHECK-PWR8-NEXT:    [[TMP25:%.*]] = sub nsw <16 x i32> [[TMP9]], [[TMP21]]
-; CHECK-PWR8-NEXT:    [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP10]], [[TMP22]]
-; CHECK-PWR8-NEXT:    [[TMP27:%.*]] = sub nsw <16 x i32> [[TMP11]], [[TMP23]]
-; CHECK-PWR8-NEXT:    [[TMP28:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP24]], i1 true)
-; CHECK-PWR8-NEXT:    [[TMP29:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP25]], i1 true)
-; CHECK-PWR8-NEXT:    [[TMP30:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP26]], i1 true)
-; CHECK-PWR8-NEXT:    [[TMP31:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP27]], i1 true)
-; CHECK-PWR8-NEXT:    [[TMP32]] = add <16 x i32> [[TMP28]], [[VEC_PHI]]
-; CHECK-PWR8-NEXT:    [[TMP33]] = add <16 x i32> [[TMP29]], [[VEC_PHI1]]
-; CHECK-PWR8-NEXT:    [[TMP34]] = add <16 x i32> [[TMP30]], [[VEC_PHI2]]
-; CHECK-PWR8-NEXT:    [[TMP35]] = add <16 x i32> [[TMP31]], [[VEC_PHI3]]
-; CHECK-PWR8-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 64
-; CHECK-PWR8-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-PWR8-NEXT:    br i1 [[TMP36]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-PWR8:       for.cond.cleanup:
-; CHECK-PWR8-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP33]], [[TMP32]]
-; CHECK-PWR8-NEXT:    [[BIN_RDX11:%.*]] = add <16 x i32> [[BIN_RDX]], [[TMP34]]
-; CHECK-PWR8-NEXT:    [[BIN_RDX12:%.*]] = add <16 x i32> [[BIN_RDX11]], [[TMP35]]
-; CHECK-PWR8-NEXT:    [[TMP37:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX12]])
-; CHECK-PWR8-NEXT:    ret i32 [[TMP37]]
-;
-; CHECK-PWR9-LABEL: @foo(
-; CHECK-PWR9-NEXT:  entry:
-; CHECK-PWR9-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-PWR9:       vector.body:
-; CHECK-PWR9-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP66:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP67:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI4:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP68:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI5:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI6:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP70:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI7:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP71:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[INDEX]]
-; CHECK-PWR9-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 16
-; CHECK-PWR9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8
-; CHECK-PWR9-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 8
-; CHECK-PWR9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 16
-; CHECK-PWR9-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD9:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 16
-; CHECK-PWR9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24
-; CHECK-PWR9-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 8
-; CHECK-PWR9-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 32
-; CHECK-PWR9-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP9]], align 16
-; CHECK-PWR9-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 40
-; CHECK-PWR9-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 8
-; CHECK-PWR9-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 48
-; CHECK-PWR9-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 16
-; CHECK-PWR9-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 56
-; CHECK-PWR9-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD14:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 8
-; CHECK-PWR9-NEXT:    [[TMP16:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP17:%.*]] = zext <8 x i8> [[WIDE_LOAD8]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP18:%.*]] = zext <8 x i8> [[WIDE_LOAD9]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP19:%.*]] = zext <8 x i8> [[WIDE_LOAD10]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP20:%.*]] = zext <8 x i8> [[WIDE_LOAD11]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP21:%.*]] = zext <8 x i8> [[WIDE_LOAD12]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP22:%.*]] = zext <8 x i8> [[WIDE_LOAD13]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP23:%.*]] = zext <8 x i8> [[WIDE_LOAD14]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[INDEX]]
-; CHECK-PWR9-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP24]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD15:%.*]] = load <8 x i8>, <8 x i8>* [[TMP25]], align 16
-; CHECK-PWR9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 8
-; CHECK-PWR9-NEXT:    [[TMP27:%.*]] = bitcast i8* [[TMP26]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD16:%.*]] = load <8 x i8>, <8 x i8>* [[TMP27]], align 8
-; CHECK-PWR9-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 16
-; CHECK-PWR9-NEXT:    [[TMP29:%.*]] = bitcast i8* [[TMP28]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD17:%.*]] = load <8 x i8>, <8 x i8>* [[TMP29]], align 16
-; CHECK-PWR9-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 24
-; CHECK-PWR9-NEXT:    [[TMP31:%.*]] = bitcast i8* [[TMP30]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD18:%.*]] = load <8 x i8>, <8 x i8>* [[TMP31]], align 8
-; CHECK-PWR9-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 32
-; CHECK-PWR9-NEXT:    [[TMP33:%.*]] = bitcast i8* [[TMP32]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD19:%.*]] = load <8 x i8>, <8 x i8>* [[TMP33]], align 16
-; CHECK-PWR9-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 40
-; CHECK-PWR9-NEXT:    [[TMP35:%.*]] = bitcast i8* [[TMP34]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD20:%.*]] = load <8 x i8>, <8 x i8>* [[TMP35]], align 8
-; CHECK-PWR9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 48
-; CHECK-PWR9-NEXT:    [[TMP37:%.*]] = bitcast i8* [[TMP36]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i8>, <8 x i8>* [[TMP37]], align 16
-; CHECK-PWR9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 56
-; CHECK-PWR9-NEXT:    [[TMP39:%.*]] = bitcast i8* [[TMP38]] to <8 x i8>*
-; CHECK-PWR9-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i8>, <8 x i8>* [[TMP39]], align 8
-; CHECK-PWR9-NEXT:    [[TMP40:%.*]] = zext <8 x i8> [[WIDE_LOAD15]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP41:%.*]] = zext <8 x i8> [[WIDE_LOAD16]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP42:%.*]] = zext <8 x i8> [[WIDE_LOAD17]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP43:%.*]] = zext <8 x i8> [[WIDE_LOAD18]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP44:%.*]] = zext <8 x i8> [[WIDE_LOAD19]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP45:%.*]] = zext <8 x i8> [[WIDE_LOAD20]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP46:%.*]] = zext <8 x i8> [[WIDE_LOAD21]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP47:%.*]] = zext <8 x i8> [[WIDE_LOAD22]] to <8 x i32>
-; CHECK-PWR9-NEXT:    [[TMP48:%.*]] = sub nsw <8 x i32> [[TMP16]], [[TMP40]]
-; CHECK-PWR9-NEXT:    [[TMP49:%.*]] = sub nsw <8 x i32> [[TMP17]], [[TMP41]]
-; CHECK-PWR9-NEXT:    [[TMP50:%.*]] = sub nsw <8 x i32> [[TMP18]], [[TMP42]]
-; CHECK-PWR9-NEXT:    [[TMP51:%.*]] = sub nsw <8 x i32> [[TMP19]], [[TMP43]]
-; CHECK-PWR9-NEXT:    [[TMP52:%.*]] = sub nsw <8 x i32> [[TMP20]], [[TMP44]]
-; CHECK-PWR9-NEXT:    [[TMP53:%.*]] = sub nsw <8 x i32> [[TMP21]], [[TMP45]]
-; CHECK-PWR9-NEXT:    [[TMP54:%.*]] = sub nsw <8 x i32> [[TMP22]], [[TMP46]]
-; CHECK-PWR9-NEXT:    [[TMP55:%.*]] = sub nsw <8 x i32> [[TMP23]], [[TMP47]]
-; CHECK-PWR9-NEXT:    [[TMP56:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP48]], i1 true)
-; CHECK-PWR9-NEXT:    [[TMP57:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP49]], i1 true)
-; CHECK-PWR9-NEXT:    [[TMP58:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP50]], i1 true)
-; CHECK-PWR9-NEXT:    [[TMP59:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP51]], i1 true)
-; CHECK-PWR9-NEXT:    [[TMP60:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP52]], i1 true)
-; CHECK-PWR9-NEXT:    [[TMP61:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP53]], i1 true)
-; CHECK-PWR9-NEXT:    [[TMP62:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP54]], i1 true)
-; CHECK-PWR9-NEXT:    [[TMP63:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP55]], i1 true)
-; CHECK-PWR9-NEXT:    [[TMP64]] = add <8 x i32> [[TMP56]], [[VEC_PHI]]
-; CHECK-PWR9-NEXT:    [[TMP65]] = add <8 x i32> [[TMP57]], [[VEC_PHI1]]
-; CHECK-PWR9-NEXT:    [[TMP66]] = add <8 x i32> [[TMP58]], [[VEC_PHI2]]
-; CHECK-PWR9-NEXT:    [[TMP67]] = add <8 x i32> [[TMP59]], [[VEC_PHI3]]
-; CHECK-PWR9-NEXT:    [[TMP68]] = add <8 x i32> [[TMP60]], [[VEC_PHI4]]
-; CHECK-PWR9-NEXT:    [[TMP69]] = add <8 x i32> [[TMP61]], [[VEC_PHI5]]
-; CHECK-PWR9-NEXT:    [[TMP70]] = add <8 x i32> [[TMP62]], [[VEC_PHI6]]
-; CHECK-PWR9-NEXT:    [[TMP71]] = add <8 x i32> [[TMP63]], [[VEC_PHI7]]
-; CHECK-PWR9-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 64
-; CHECK-PWR9-NEXT:    [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-PWR9-NEXT:    br i1 [[TMP72]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-PWR9:       for.cond.cleanup:
-; CHECK-PWR9-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP65]], [[TMP64]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX23:%.*]] = add <8 x i32> [[BIN_RDX]], [[TMP66]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX24:%.*]] = add <8 x i32> [[BIN_RDX23]], [[TMP67]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX25:%.*]] = add <8 x i32> [[BIN_RDX24]], [[TMP68]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX26:%.*]] = add <8 x i32> [[BIN_RDX25]], [[TMP69]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX27:%.*]] = add <8 x i32> [[BIN_RDX26]], [[TMP70]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX28:%.*]] = add <8 x i32> [[BIN_RDX27]], [[TMP71]]
-; CHECK-PWR9-NEXT:    [[TMP73:%.*]] = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX28]])
-; CHECK-PWR9-NEXT:    ret i32 [[TMP73]]
-;
+; CHECK-LABEL: foo
 
+; CHECK-PWR8: Executing best plan with VF=16, UF=4
 
+; CHECK-PWR9: Executing best plan with VF=8, UF=8
 
 
 entry:
@@ -219,72 +43,10 @@ define i32 @goo() {
 ; For indvars.iv used in a computating chain only feeding into getelementptr or cmp,
 ; it will not have vector version and the vector register usage will not exceed the
 ; available vector register number.
-; CHECK-LABEL: @goo(
-; CHECK-NEXT:  iter.check:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 16
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 32
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 48
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
-; CHECK-NEXT:    [[TMP13:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP15]], align 2
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i64 16
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP17]], align 2
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i64 32
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP19]], align 2
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i64 48
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <16 x i8>, <16 x i8>* [[TMP21]], align 2
-; CHECK-NEXT:    [[TMP22:%.*]] = zext <16 x i8> [[WIDE_LOAD7]] to <16 x i32>
-; CHECK-NEXT:    [[TMP23:%.*]] = zext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = zext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
-; CHECK-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
-; CHECK-NEXT:    [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP9]], [[TMP22]]
-; CHECK-NEXT:    [[TMP27:%.*]] = sub nsw <16 x i32> [[TMP10]], [[TMP23]]
-; CHECK-NEXT:    [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP11]], [[TMP24]]
-; CHECK-NEXT:    [[TMP29:%.*]] = sub nsw <16 x i32> [[TMP12]], [[TMP25]]
-; CHECK-NEXT:    [[TMP30:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP26]], i1 true)
-; CHECK-NEXT:    [[TMP31:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP27]], i1 true)
-; CHECK-NEXT:    [[TMP32:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP28]], i1 true)
-; CHECK-NEXT:    [[TMP33:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP29]], i1 true)
-; CHECK-NEXT:    [[TMP34]] = add <16 x i32> [[TMP30]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP35]] = add <16 x i32> [[TMP31]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP36]] = add <16 x i32> [[TMP32]], [[VEC_PHI2]]
-; CHECK-NEXT:    [[TMP37]] = add <16 x i32> [[TMP33]], [[VEC_PHI3]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 64
-; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP38]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP35]], [[TMP34]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <16 x i32> [[BIN_RDX]], [[TMP36]]
-; CHECK-NEXT:    [[BIN_RDX12:%.*]] = add <16 x i32> [[BIN_RDX11]], [[TMP37]]
-; CHECK-NEXT:    [[TMP39:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX12]])
-; CHECK-NEXT:    ret i32 [[TMP39]]
-;
 
+; CHECK-LABEL: goo
+
+; CHECK: Executing best plan with VF=16, UF=4
 
 entry:
   br label %for.body
@@ -315,207 +77,9 @@ for.body:                                         ; preds = %for.body, %entry
 }
 
 define i64 @bar(i64* nocapture %a) {
-; CHECK-LABEL: @bar(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI12:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI13:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI14:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI15:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI16:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI17:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI18:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP43:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI19:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP44:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI20:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP45:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI21:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP46:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI22:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP47:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT:    [[STEP_ADD1:%.*]] = add <2 x i64> [[VEC_IND]], <i64 4, i64 4>
-; CHECK-NEXT:    [[STEP_ADD2:%.*]] = add <2 x i64> [[VEC_IND]], <i64 6, i64 6>
-; CHECK-NEXT:    [[STEP_ADD3:%.*]] = add <2 x i64> [[VEC_IND]], <i64 8, i64 8>
-; CHECK-NEXT:    [[STEP_ADD4:%.*]] = add <2 x i64> [[VEC_IND]], <i64 10, i64 10>
-; CHECK-NEXT:    [[STEP_ADD5:%.*]] = add <2 x i64> [[VEC_IND]], <i64 12, i64 12>
-; CHECK-NEXT:    [[STEP_ADD6:%.*]] = add <2 x i64> [[VEC_IND]], <i64 14, i64 14>
-; CHECK-NEXT:    [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 16, i64 16>
-; CHECK-NEXT:    [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 18, i64 18>
-; CHECK-NEXT:    [[STEP_ADD9:%.*]] = add <2 x i64> [[VEC_IND]], <i64 20, i64 20>
-; CHECK-NEXT:    [[STEP_ADD10:%.*]] = add <2 x i64> [[VEC_IND]], <i64 22, i64 22>
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD23:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 4
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD24:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 6
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 8
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD26:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 10
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD27:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD28:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 14
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i64* [[TMP14]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 16
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD30:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 8
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 18
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64* [[TMP18]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD31:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 8
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 20
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i64* [[TMP20]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <2 x i64>, <2 x i64>* [[TMP21]], align 8
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 22
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i64* [[TMP22]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD33:%.*]] = load <2 x i64>, <2 x i64>* [[TMP23]], align 8
-; CHECK-NEXT:    [[TMP24:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP25:%.*]] = add nsw <2 x i64> [[STEP_ADD]], [[WIDE_LOAD23]]
-; CHECK-NEXT:    [[TMP26:%.*]] = add nsw <2 x i64> [[STEP_ADD1]], [[WIDE_LOAD24]]
-; CHECK-NEXT:    [[TMP27:%.*]] = add nsw <2 x i64> [[STEP_ADD2]], [[WIDE_LOAD25]]
-; CHECK-NEXT:    [[TMP28:%.*]] = add nsw <2 x i64> [[STEP_ADD3]], [[WIDE_LOAD26]]
-; CHECK-NEXT:    [[TMP29:%.*]] = add nsw <2 x i64> [[STEP_ADD4]], [[WIDE_LOAD27]]
-; CHECK-NEXT:    [[TMP30:%.*]] = add nsw <2 x i64> [[STEP_ADD5]], [[WIDE_LOAD28]]
-; CHECK-NEXT:    [[TMP31:%.*]] = add nsw <2 x i64> [[STEP_ADD6]], [[WIDE_LOAD29]]
-; CHECK-NEXT:    [[TMP32:%.*]] = add nsw <2 x i64> [[STEP_ADD7]], [[WIDE_LOAD30]]
-; CHECK-NEXT:    [[TMP33:%.*]] = add nsw <2 x i64> [[STEP_ADD8]], [[WIDE_LOAD31]]
-; CHECK-NEXT:    [[TMP34:%.*]] = add nsw <2 x i64> [[STEP_ADD9]], [[WIDE_LOAD32]]
-; CHECK-NEXT:    [[TMP35:%.*]] = add nsw <2 x i64> [[STEP_ADD10]], [[WIDE_LOAD33]]
-; CHECK-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP1]], align 8
-; CHECK-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP3]], align 8
-; CHECK-NEXT:    store <2 x i64> [[TMP26]], <2 x i64>* [[TMP5]], align 8
-; CHECK-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP7]], align 8
-; CHECK-NEXT:    store <2 x i64> [[TMP28]], <2 x i64>* [[TMP9]], align 8
-; CHECK-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP11]], align 8
-; CHECK-NEXT:    store <2 x i64> [[TMP30]], <2 x i64>* [[TMP13]], align 8
-; CHECK-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP15]], align 8
-; CHECK-NEXT:    store <2 x i64> [[TMP32]], <2 x i64>* [[TMP17]], align 8
-; CHECK-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP19]], align 8
-; CHECK-NEXT:    store <2 x i64> [[TMP34]], <2 x i64>* [[TMP21]], align 8
-; CHECK-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP23]], align 8
-; CHECK-NEXT:    [[TMP36]] = add <2 x i64> [[TMP24]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP37]] = add <2 x i64> [[TMP25]], [[VEC_PHI12]]
-; CHECK-NEXT:    [[TMP38]] = add <2 x i64> [[TMP26]], [[VEC_PHI13]]
-; CHECK-NEXT:    [[TMP39]] = add <2 x i64> [[TMP27]], [[VEC_PHI14]]
-; CHECK-NEXT:    [[TMP40]] = add <2 x i64> [[TMP28]], [[VEC_PHI15]]
-; CHECK-NEXT:    [[TMP41]] = add <2 x i64> [[TMP29]], [[VEC_PHI16]]
-; CHECK-NEXT:    [[TMP42]] = add <2 x i64> [[TMP30]], [[VEC_PHI17]]
-; CHECK-NEXT:    [[TMP43]] = add <2 x i64> [[TMP31]], [[VEC_PHI18]]
-; CHECK-NEXT:    [[TMP44]] = add <2 x i64> [[TMP32]], [[VEC_PHI19]]
-; CHECK-NEXT:    [[TMP45]] = add <2 x i64> [[TMP33]], [[VEC_PHI20]]
-; CHECK-NEXT:    [[TMP46]] = add <2 x i64> [[TMP34]], [[VEC_PHI21]]
-; CHECK-NEXT:    [[TMP47]] = add <2 x i64> [[TMP35]], [[VEC_PHI22]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 24
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 24, i64 24>
-; CHECK-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
-; CHECK-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <2 x i64> [[TMP37]], [[TMP36]]
-; CHECK-NEXT:    [[BIN_RDX34:%.*]] = add <2 x i64> [[BIN_RDX]], [[TMP38]]
-; CHECK-NEXT:    [[BIN_RDX35:%.*]] = add <2 x i64> [[BIN_RDX34]], [[TMP39]]
-; CHECK-NEXT:    [[BIN_RDX36:%.*]] = add <2 x i64> [[BIN_RDX35]], [[TMP40]]
-; CHECK-NEXT:    [[BIN_RDX37:%.*]] = add <2 x i64> [[BIN_RDX36]], [[TMP41]]
-; CHECK-NEXT:    [[BIN_RDX38:%.*]] = add <2 x i64> [[BIN_RDX37]], [[TMP42]]
-; CHECK-NEXT:    [[BIN_RDX39:%.*]] = add <2 x i64> [[BIN_RDX38]], [[TMP43]]
-; CHECK-NEXT:    [[BIN_RDX40:%.*]] = add <2 x i64> [[BIN_RDX39]], [[TMP44]]
-; CHECK-NEXT:    [[BIN_RDX41:%.*]] = add <2 x i64> [[BIN_RDX40]], [[TMP45]]
-; CHECK-NEXT:    [[BIN_RDX42:%.*]] = add <2 x i64> [[BIN_RDX41]], [[TMP46]]
-; CHECK-NEXT:    [[BIN_RDX43:%.*]] = add <2 x i64> [[BIN_RDX42]], [[TMP47]]
-; CHECK-NEXT:    [[TMP49:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX43]])
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1008
-; CHECK-NEXT:    [[TMP50:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP50]], 1008
-; CHECK-NEXT:    store i64 [[ADD]], i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i64 [[ADD]], [[TMP49]]
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1009
-; CHECK-NEXT:    [[TMP51:%.*]] = load i64, i64* [[ARRAYIDX_1]], align 8
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i64 [[TMP51]], 1009
-; CHECK-NEXT:    store i64 [[ADD_1]], i64* [[ARRAYIDX_1]], align 8
-; CHECK-NEXT:    [[ADD2_1:%.*]] = add nsw i64 [[ADD_1]], [[ADD2]]
-; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1010
-; CHECK-NEXT:    [[TMP52:%.*]] = load i64, i64* [[ARRAYIDX_2]], align 8
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i64 [[TMP52]], 1010
-; CHECK-NEXT:    store i64 [[ADD_2]], i64* [[ARRAYIDX_2]], align 8
-; CHECK-NEXT:    [[ADD2_2:%.*]] = add nsw i64 [[ADD_2]], [[ADD2_1]]
-; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1011
-; CHECK-NEXT:    [[TMP53:%.*]] = load i64, i64* [[ARRAYIDX_3]], align 8
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i64 [[TMP53]], 1011
-; CHECK-NEXT:    store i64 [[ADD_3]], i64* [[ARRAYIDX_3]], align 8
-; CHECK-NEXT:    [[ADD2_3:%.*]] = add nsw i64 [[ADD_3]], [[ADD2_2]]
-; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1012
-; CHECK-NEXT:    [[TMP54:%.*]] = load i64, i64* [[ARRAYIDX_4]], align 8
-; CHECK-NEXT:    [[ADD_4:%.*]] = add nsw i64 [[TMP54]], 1012
-; CHECK-NEXT:    store i64 [[ADD_4]], i64* [[ARRAYIDX_4]], align 8
-; CHECK-NEXT:    [[ADD2_4:%.*]] = add nsw i64 [[ADD_4]], [[ADD2_3]]
-; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1013
-; CHECK-NEXT:    [[TMP55:%.*]] = load i64, i64* [[ARRAYIDX_5]], align 8
-; CHECK-NEXT:    [[ADD_5:%.*]] = add nsw i64 [[TMP55]], 1013
-; CHECK-NEXT:    store i64 [[ADD_5]], i64* [[ARRAYIDX_5]], align 8
-; CHECK-NEXT:    [[ADD2_5:%.*]] = add nsw i64 [[ADD_5]], [[ADD2_4]]
-; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1014
-; CHECK-NEXT:    [[TMP56:%.*]] = load i64, i64* [[ARRAYIDX_6]], align 8
-; CHECK-NEXT:    [[ADD_6:%.*]] = add nsw i64 [[TMP56]], 1014
-; CHECK-NEXT:    store i64 [[ADD_6]], i64* [[ARRAYIDX_6]], align 8
-; CHECK-NEXT:    [[ADD2_6:%.*]] = add nsw i64 [[ADD_6]], [[ADD2_5]]
-; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1015
-; CHECK-NEXT:    [[TMP57:%.*]] = load i64, i64* [[ARRAYIDX_7]], align 8
-; CHECK-NEXT:    [[ADD_7:%.*]] = add nsw i64 [[TMP57]], 1015
-; CHECK-NEXT:    store i64 [[ADD_7]], i64* [[ARRAYIDX_7]], align 8
-; CHECK-NEXT:    [[ADD2_7:%.*]] = add nsw i64 [[ADD_7]], [[ADD2_6]]
-; CHECK-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1016
-; CHECK-NEXT:    [[TMP58:%.*]] = load i64, i64* [[ARRAYIDX_8]], align 8
-; CHECK-NEXT:    [[ADD_8:%.*]] = add nsw i64 [[TMP58]], 1016
-; CHECK-NEXT:    store i64 [[ADD_8]], i64* [[ARRAYIDX_8]], align 8
-; CHECK-NEXT:    [[ADD2_8:%.*]] = add nsw i64 [[ADD_8]], [[ADD2_7]]
-; CHECK-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1017
-; CHECK-NEXT:    [[TMP59:%.*]] = load i64, i64* [[ARRAYIDX_9]], align 8
-; CHECK-NEXT:    [[ADD_9:%.*]] = add nsw i64 [[TMP59]], 1017
-; CHECK-NEXT:    store i64 [[ADD_9]], i64* [[ARRAYIDX_9]], align 8
-; CHECK-NEXT:    [[ADD2_9:%.*]] = add nsw i64 [[ADD_9]], [[ADD2_8]]
-; CHECK-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1018
-; CHECK-NEXT:    [[TMP60:%.*]] = load i64, i64* [[ARRAYIDX_10]], align 8
-; CHECK-NEXT:    [[ADD_10:%.*]] = add nsw i64 [[TMP60]], 1018
-; CHECK-NEXT:    store i64 [[ADD_10]], i64* [[ARRAYIDX_10]], align 8
-; CHECK-NEXT:    [[ADD2_10:%.*]] = add nsw i64 [[ADD_10]], [[ADD2_9]]
-; CHECK-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1019
-; CHECK-NEXT:    [[TMP61:%.*]] = load i64, i64* [[ARRAYIDX_11]], align 8
-; CHECK-NEXT:    [[ADD_11:%.*]] = add nsw i64 [[TMP61]], 1019
-; CHECK-NEXT:    store i64 [[ADD_11]], i64* [[ARRAYIDX_11]], align 8
-; CHECK-NEXT:    [[ADD2_11:%.*]] = add nsw i64 [[ADD_11]], [[ADD2_10]]
-; CHECK-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1020
-; CHECK-NEXT:    [[TMP62:%.*]] = load i64, i64* [[ARRAYIDX_12]], align 8
-; CHECK-NEXT:    [[ADD_12:%.*]] = add nsw i64 [[TMP62]], 1020
-; CHECK-NEXT:    store i64 [[ADD_12]], i64* [[ARRAYIDX_12]], align 8
-; CHECK-NEXT:    [[ADD2_12:%.*]] = add nsw i64 [[ADD_12]], [[ADD2_11]]
-; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1021
-; CHECK-NEXT:    [[TMP63:%.*]] = load i64, i64* [[ARRAYIDX_13]], align 8
-; CHECK-NEXT:    [[ADD_13:%.*]] = add nsw i64 [[TMP63]], 1021
-; CHECK-NEXT:    store i64 [[ADD_13]], i64* [[ARRAYIDX_13]], align 8
-; CHECK-NEXT:    [[ADD2_13:%.*]] = add nsw i64 [[ADD_13]], [[ADD2_12]]
-; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1022
-; CHECK-NEXT:    [[TMP64:%.*]] = load i64, i64* [[ARRAYIDX_14]], align 8
-; CHECK-NEXT:    [[ADD_14:%.*]] = add nsw i64 [[TMP64]], 1022
-; CHECK-NEXT:    store i64 [[ADD_14]], i64* [[ARRAYIDX_14]], align 8
-; CHECK-NEXT:    [[ADD2_14:%.*]] = add nsw i64 [[ADD_14]], [[ADD2_13]]
-; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1023
-; CHECK-NEXT:    [[TMP65:%.*]] = load i64, i64* [[ARRAYIDX_15]], align 8
-; CHECK-NEXT:    [[ADD_15:%.*]] = add nsw i64 [[TMP65]], 1023
-; CHECK-NEXT:    store i64 [[ADD_15]], i64* [[ARRAYIDX_15]], align 8
-; CHECK-NEXT:    [[ADD2_15:%.*]] = add nsw i64 [[ADD_15]], [[ADD2_14]]
-; CHECK-NEXT:    ret i64 [[ADD2_15]]
-;
+; CHECK-LABEL: bar
 
+; CHECK: Executing best plan with VF=2, UF=12
 
 entry:
   br label %for.body
@@ -542,116 +106,8 @@ for.body:
 @c = external global [0 x i32], align 4
 
 define void @hoo(i32 %n) {
-; CHECK-LABEL: @hoo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[INDUCTION1:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT:    [[INDUCTION2:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT:    [[INDUCTION3:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT:    [[INDUCTION4:%.*]] = add nuw nsw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[INDUCTION5:%.*]] = add nuw nsw i64 [[INDEX]], 5
-; CHECK-NEXT:    [[INDUCTION6:%.*]] = add nuw nsw i64 [[INDEX]], 6
-; CHECK-NEXT:    [[INDUCTION7:%.*]] = add nuw nsw i64 [[INDEX]], 7
-; CHECK-NEXT:    [[INDUCTION8:%.*]] = add nuw nsw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[INDUCTION9:%.*]] = add nuw nsw i64 [[INDEX]], 9
-; CHECK-NEXT:    [[INDUCTION10:%.*]] = add nuw nsw i64 [[INDEX]], 10
-; CHECK-NEXT:    [[INDUCTION11:%.*]] = add nuw nsw i64 [[INDEX]], 11
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP1]], align 8
-; CHECK-NEXT:    [[TMP14:%.*]] = load i64, i64* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = load i64, i64* [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP18:%.*]] = load i64, i64* [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP19:%.*]] = load i64, i64* [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP20:%.*]] = load i64, i64* [[TMP8]], align 8
-; CHECK-NEXT:    [[TMP21:%.*]] = load i64, i64* [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP22:%.*]] = load i64, i64* [[TMP10]], align 8
-; CHECK-NEXT:    [[TMP23:%.*]] = load i64, i64* [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP18]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP19]]
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP20]]
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP21]]
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP22]]
-; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP23]]
-; CHECK-NEXT:    [[TMP36:%.*]] = load i32, i32* [[TMP24]], align 4
-; CHECK-NEXT:    [[TMP37:%.*]] = load i32, i32* [[TMP25]], align 4
-; CHECK-NEXT:    [[TMP38:%.*]] = load i32, i32* [[TMP26]], align 4
-; CHECK-NEXT:    [[TMP39:%.*]] = load i32, i32* [[TMP27]], align 4
-; CHECK-NEXT:    [[TMP40:%.*]] = load i32, i32* [[TMP28]], align 4
-; CHECK-NEXT:    [[TMP41:%.*]] = load i32, i32* [[TMP29]], align 4
-; CHECK-NEXT:    [[TMP42:%.*]] = load i32, i32* [[TMP30]], align 4
-; CHECK-NEXT:    [[TMP43:%.*]] = load i32, i32* [[TMP31]], align 4
-; CHECK-NEXT:    [[TMP44:%.*]] = load i32, i32* [[TMP32]], align 4
-; CHECK-NEXT:    [[TMP45:%.*]] = load i32, i32* [[TMP33]], align 4
-; CHECK-NEXT:    [[TMP46:%.*]] = load i32, i32* [[TMP34]], align 4
-; CHECK-NEXT:    [[TMP47:%.*]] = load i32, i32* [[TMP35]], align 4
-; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION1]]
-; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION2]]
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION3]]
-; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION4]]
-; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION5]]
-; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION6]]
-; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION7]]
-; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION8]]
-; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION9]]
-; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION10]]
-; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION11]]
-; CHECK-NEXT:    store i32 [[TMP36]], i32* [[TMP48]], align 4
-; CHECK-NEXT:    store i32 [[TMP37]], i32* [[TMP49]], align 4
-; CHECK-NEXT:    store i32 [[TMP38]], i32* [[TMP50]], align 4
-; CHECK-NEXT:    store i32 [[TMP39]], i32* [[TMP51]], align 4
-; CHECK-NEXT:    store i32 [[TMP40]], i32* [[TMP52]], align 4
-; CHECK-NEXT:    store i32 [[TMP41]], i32* [[TMP53]], align 4
-; CHECK-NEXT:    store i32 [[TMP42]], i32* [[TMP54]], align 4
-; CHECK-NEXT:    store i32 [[TMP43]], i32* [[TMP55]], align 4
-; CHECK-NEXT:    store i32 [[TMP44]], i32* [[TMP56]], align 4
-; CHECK-NEXT:    store i32 [[TMP45]], i32* [[TMP57]], align 4
-; CHECK-NEXT:    store i32 [[TMP46]], i32* [[TMP58]], align 4
-; CHECK-NEXT:    store i32 [[TMP47]], i32* [[TMP59]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 12
-; CHECK-NEXT:    [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9996
-; CHECK-NEXT:    br i1 [[TMP60]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP:%.*]] = load i64, i64* getelementptr inbounds ([0 x i64], [0 x i64]* @d, i64 0, i64 9996), align 8
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    store i32 [[TMP1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @c, i64 0, i64 9996), align 4
-; CHECK-NEXT:    [[TMP_1:%.*]] = load i64, i64* getelementptr inbounds ([0 x i64], [0 x i64]* @d, i64 0, i64 9997), align 8
-; CHECK-NEXT:    [[ARRAYIDX1_1:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP_1]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = load i32, i32* [[ARRAYIDX1_1]], align 4
-; CHECK-NEXT:    store i32 [[TMP1_1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @c, i64 0, i64 9997), align 4
-; CHECK-NEXT:    [[TMP_2:%.*]] = load i64, i64* getelementptr inbounds ([0 x i64], [0 x i64]* @d, i64 0, i64 9998), align 8
-; CHECK-NEXT:    [[ARRAYIDX1_2:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP_2]]
-; CHECK-NEXT:    [[TMP1_2:%.*]] = load i32, i32* [[ARRAYIDX1_2]], align 4
-; CHECK-NEXT:    store i32 [[TMP1_2]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @c, i64 0, i64 9998), align 4
-; CHECK-NEXT:    [[TMP_3:%.*]] = load i64, i64* getelementptr inbounds ([0 x i64], [0 x i64]* @d, i64 0, i64 9999), align 8
-; CHECK-NEXT:    [[ARRAYIDX1_3:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP_3]]
-; CHECK-NEXT:    [[TMP1_3:%.*]] = load i32, i32* [[ARRAYIDX1_3]], align 4
-; CHECK-NEXT:    store i32 [[TMP1_3]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @c, i64 0, i64 9999), align 4
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: hoo
+; CHECK: Executing best plan with VF=1, UF=12
 
 entry:
   br label %for.body
@@ -673,159 +129,13 @@ for.end:                                          ; preds = %for.body
 }
 
 define float @float_(float* nocapture readonly %a, float* nocapture readonly %b, i32 %n) {
-; CHECK-LABEL: @float_(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP]], label [[PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       preheader:
-; CHECK-NEXT:    [[T033:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[T033]], -1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 352
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 5
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_MOD_VF_LHS_TRUNC:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[N_MOD_VF34:%.*]] = urem i32 [[N_MOD_VF_LHS_TRUNC]], 12
-; CHECK-NEXT:    [[N_MOD_VF_ZEXT:%.*]] = zext i32 [[N_MOD_VF34]] to i64
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 [[TMP2]], [[N_MOD_VF_ZEXT]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP63:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP66:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP67:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP68:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI7:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP70:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI8:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP71:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI9:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP72:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI10:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP73:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI11:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP74:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 5
-; CHECK-NEXT:    [[INDUCTION12:%.*]] = or i64 [[OFFSET_IDX]], 32
-; CHECK-NEXT:    [[INDUCTION13:%.*]] = or i64 [[OFFSET_IDX]], 64
-; CHECK-NEXT:    [[INDUCTION14:%.*]] = or i64 [[OFFSET_IDX]], 96
-; CHECK-NEXT:    [[INDUCTION15:%.*]] = add i64 [[OFFSET_IDX]], 128
-; CHECK-NEXT:    [[INDUCTION16:%.*]] = add i64 [[OFFSET_IDX]], 160
-; CHECK-NEXT:    [[INDUCTION17:%.*]] = add i64 [[OFFSET_IDX]], 192
-; CHECK-NEXT:    [[INDUCTION18:%.*]] = add i64 [[OFFSET_IDX]], 224
-; CHECK-NEXT:    [[INDUCTION19:%.*]] = add i64 [[OFFSET_IDX]], 256
-; CHECK-NEXT:    [[INDUCTION20:%.*]] = add i64 [[OFFSET_IDX]], 288
-; CHECK-NEXT:    [[INDUCTION21:%.*]] = add i64 [[OFFSET_IDX]], 320
-; CHECK-NEXT:    [[INDUCTION22:%.*]] = add i64 [[OFFSET_IDX]], 352
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION12]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION13]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION14]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION15]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION16]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION17]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION18]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION19]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION20]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION21]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION22]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = load float, float* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP21:%.*]] = load float, float* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = load float, float* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = load float, float* [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP25:%.*]] = load float, float* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP26:%.*]] = load float, float* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION16]]
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION17]]
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION18]]
-; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION19]]
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION20]]
-; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION21]]
-; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION22]]
-; CHECK-NEXT:    [[TMP39:%.*]] = load float, float* [[TMP27]], align 4
-; CHECK-NEXT:    [[TMP40:%.*]] = load float, float* [[TMP28]], align 4
-; CHECK-NEXT:    [[TMP41:%.*]] = load float, float* [[TMP29]], align 4
-; CHECK-NEXT:    [[TMP42:%.*]] = load float, float* [[TMP30]], align 4
-; CHECK-NEXT:    [[TMP43:%.*]] = load float, float* [[TMP31]], align 4
-; CHECK-NEXT:    [[TMP44:%.*]] = load float, float* [[TMP32]], align 4
-; CHECK-NEXT:    [[TMP45:%.*]] = load float, float* [[TMP33]], align 4
-; CHECK-NEXT:    [[TMP46:%.*]] = load float, float* [[TMP34]], align 4
-; CHECK-NEXT:    [[TMP47:%.*]] = load float, float* [[TMP35]], align 4
-; CHECK-NEXT:    [[TMP48:%.*]] = load float, float* [[TMP36]], align 4
-; CHECK-NEXT:    [[TMP49:%.*]] = load float, float* [[TMP37]], align 4
-; CHECK-NEXT:    [[TMP50:%.*]] = load float, float* [[TMP38]], align 4
-; CHECK-NEXT:    [[TMP51:%.*]] = fadd fast float [[TMP15]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP52:%.*]] = fadd fast float [[TMP16]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP53:%.*]] = fadd fast float [[TMP17]], [[VEC_PHI2]]
-; CHECK-NEXT:    [[TMP54:%.*]] = fadd fast float [[TMP18]], [[VEC_PHI3]]
-; CHECK-NEXT:    [[TMP55:%.*]] = fadd fast float [[TMP19]], [[VEC_PHI4]]
-; CHECK-NEXT:    [[TMP56:%.*]] = fadd fast float [[TMP20]], [[VEC_PHI5]]
-; CHECK-NEXT:    [[TMP57:%.*]] = fadd fast float [[TMP21]], [[VEC_PHI6]]
-; CHECK-NEXT:    [[TMP58:%.*]] = fadd fast float [[TMP22]], [[VEC_PHI7]]
-; CHECK-NEXT:    [[TMP59:%.*]] = fadd fast float [[TMP23]], [[VEC_PHI8]]
-; CHECK-NEXT:    [[TMP60:%.*]] = fadd fast float [[TMP24]], [[VEC_PHI9]]
-; CHECK-NEXT:    [[TMP61:%.*]] = fadd fast float [[TMP25]], [[VEC_PHI10]]
-; CHECK-NEXT:    [[TMP62:%.*]] = fadd fast float [[TMP26]], [[VEC_PHI11]]
-; CHECK-NEXT:    [[TMP63]] = fadd fast float [[TMP51]], [[TMP39]]
-; CHECK-NEXT:    [[TMP64]] = fadd fast float [[TMP52]], [[TMP40]]
-; CHECK-NEXT:    [[TMP65]] = fadd fast float [[TMP53]], [[TMP41]]
-; CHECK-NEXT:    [[TMP66]] = fadd fast float [[TMP54]], [[TMP42]]
-; CHECK-NEXT:    [[TMP67]] = fadd fast float [[TMP55]], [[TMP43]]
-; CHECK-NEXT:    [[TMP68]] = fadd fast float [[TMP56]], [[TMP44]]
-; CHECK-NEXT:    [[TMP69]] = fadd fast float [[TMP57]], [[TMP45]]
-; CHECK-NEXT:    [[TMP70]] = fadd fast float [[TMP58]], [[TMP46]]
-; CHECK-NEXT:    [[TMP71]] = fadd fast float [[TMP59]], [[TMP47]]
-; CHECK-NEXT:    [[TMP72]] = fadd fast float [[TMP60]], [[TMP48]]
-; CHECK-NEXT:    [[TMP73]] = fadd fast float [[TMP61]], [[TMP49]]
-; CHECK-NEXT:    [[TMP74]] = fadd fast float [[TMP62]], [[TMP50]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 12
-; CHECK-NEXT:    [[TMP75:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP75]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 5
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast float [[TMP64]], [[TMP63]]
-; CHECK-NEXT:    [[BIN_RDX23:%.*]] = fadd fast float [[BIN_RDX]], [[TMP65]]
-; CHECK-NEXT:    [[BIN_RDX24:%.*]] = fadd fast float [[BIN_RDX23]], [[TMP66]]
-; CHECK-NEXT:    [[BIN_RDX25:%.*]] = fadd fast float [[BIN_RDX24]], [[TMP67]]
-; CHECK-NEXT:    [[BIN_RDX26:%.*]] = fadd fast float [[BIN_RDX25]], [[TMP68]]
-; CHECK-NEXT:    [[BIN_RDX27:%.*]] = fadd fast float [[BIN_RDX26]], [[TMP69]]
-; CHECK-NEXT:    [[BIN_RDX28:%.*]] = fadd fast float [[BIN_RDX27]], [[TMP70]]
-; CHECK-NEXT:    [[BIN_RDX29:%.*]] = fadd fast float [[BIN_RDX28]], [[TMP71]]
-; CHECK-NEXT:    [[BIN_RDX30:%.*]] = fadd fast float [[BIN_RDX29]], [[TMP72]]
-; CHECK-NEXT:    [[BIN_RDX31:%.*]] = fadd fast float [[BIN_RDX30]], [[TMP73]]
-; CHECK-NEXT:    [[BIN_RDX32:%.*]] = fadd fast float [[BIN_RDX31]], [[TMP74]]
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF34]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_PREHEADER]]
-; CHECK:       for.preheader:
-; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[S_02_PH:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[BIN_RDX32]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR:%.*]]
-; CHECK:       for:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR]] ], [ [[INDVARS_IV_PH]], [[FOR_PREHEADER]] ]
-; CHECK-NEXT:    [[S_02:%.*]] = phi float [ [[ADD4:%.*]], [[FOR]] ], [ [[S_02_PH]], [[FOR_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[T1]], [[S_02]]
-; CHECK-NEXT:    [[ADD4]] = fadd fast float [[ADD]], [[T2]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 32
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[T033]]
-; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR]], label [[FOR_END]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[S_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[BIN_RDX32]], [[MIDDLE_BLOCK]] ], [ [[ADD4]], [[FOR]] ]
-; CHECK-NEXT:    ret float [[S_0_LCSSA]]
-;
+;CHECK-LABEL: float_
+;CHECK: LV(REG): VF = 1
+;CHECK: LV(REG): Found max usage: 2 item
+;CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
+;CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 3 registers
+;CHECK: LV(REG): Found invariant usage: 1 item
+;CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers
 
 entry:
   %cmp = icmp sgt i32 %n, 0
@@ -859,243 +169,20 @@ for.end:
 
 
 define void @double_(double* nocapture %A, i32 %n) nounwind uwtable ssp {
-; CHECK-PWR8-LABEL: @double_(
-; CHECK-PWR8-NEXT:    [[TMP1:%.*]] = sext i32 [[N:%.*]] to i64
-; CHECK-PWR8-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
-; CHECK-PWR8-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; CHECK-PWR8-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[N]], 0
-; CHECK-PWR8-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-PWR8:       vector.ph:
-; CHECK-PWR8-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP3]], 8589934590
-; CHECK-PWR8-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 -1
-; CHECK-PWR8-NEXT:    [[TMP5:%.*]] = add nsw i64 [[N_VEC]], -2
-; CHECK-PWR8-NEXT:    [[TMP6:%.*]] = lshr exact i64 [[TMP5]], 1
-; CHECK-PWR8-NEXT:    [[TMP7:%.*]] = add nuw i64 [[TMP6]], 1
-; CHECK-PWR8-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP7]], 1
-; CHECK-PWR8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP5]], 0
-; CHECK-PWR8-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
-; CHECK-PWR8:       vector.ph.new:
-; CHECK-PWR8-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP7]], -2
-; CHECK-PWR8-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-PWR8:       vector.body:
-; CHECK-PWR8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[OFFSET_IDX:%.*]] = sub nsw i64 [[TMP1]], [[INDEX]]
-; CHECK-PWR8-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 [[OFFSET_IDX]]
-; CHECK-PWR8-NEXT:    [[TMP10:%.*]] = bitcast double* [[TMP9]] to <2 x double>*
-; CHECK-PWR8-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP10]], align 8
-; CHECK-PWR8-NEXT:    [[REVERSE:%.*]] = shufflevector <2 x double> [[WIDE_LOAD]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-PWR8-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[REVERSE]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP12:%.*]] = fmul <2 x double> [[REVERSE]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP13:%.*]] = fadd <2 x double> [[TMP11]], [[TMP12]]
-; CHECK-PWR8-NEXT:    [[TMP14:%.*]] = fadd <2 x double> [[TMP13]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP15:%.*]] = fmul <2 x double> [[TMP14]], <double 5.000000e-01, double 5.000000e-01>
-; CHECK-PWR8-NEXT:    [[TMP16:%.*]] = fadd <2 x double> [[TMP12]], [[TMP15]]
-; CHECK-PWR8-NEXT:    [[TMP17:%.*]] = fsub <2 x double> [[TMP16]], [[TMP11]]
-; CHECK-PWR8-NEXT:    [[TMP18:%.*]] = fadd <2 x double> [[REVERSE]], [[TMP17]]
-; CHECK-PWR8-NEXT:    [[TMP19:%.*]] = fdiv <2 x double> [[TMP14]], [[TMP18]]
-; CHECK-PWR8-NEXT:    [[TMP20:%.*]] = fmul <2 x double> [[TMP14]], [[TMP19]]
-; CHECK-PWR8-NEXT:    [[TMP21:%.*]] = fmul <2 x double> [[TMP12]], [[TMP20]]
-; CHECK-PWR8-NEXT:    [[TMP22:%.*]] = fmul <2 x double> [[TMP11]], [[TMP21]]
-; CHECK-PWR8-NEXT:    [[TMP23:%.*]] = fadd <2 x double> [[TMP22]], <double -3.000000e+00, double -3.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP24:%.*]] = fsub <2 x double> [[REVERSE]], [[TMP11]]
-; CHECK-PWR8-NEXT:    [[TMP25:%.*]] = fadd <2 x double> [[TMP12]], [[TMP24]]
-; CHECK-PWR8-NEXT:    [[TMP26:%.*]] = fadd <2 x double> [[TMP25]], [[TMP19]]
-; CHECK-PWR8-NEXT:    [[TMP27:%.*]] = fadd <2 x double> [[TMP26]], [[TMP23]]
-; CHECK-PWR8-NEXT:    [[TMP28:%.*]] = fadd <2 x double> [[TMP27]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP29:%.*]] = fmul <2 x double> [[REVERSE]], [[TMP28]]
-; CHECK-PWR8-NEXT:    [[REVERSE1:%.*]] = shufflevector <2 x double> [[TMP29]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-PWR8-NEXT:    store <2 x double> [[REVERSE1]], <2 x double>* [[TMP10]], align 8
-; CHECK-PWR8-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 2
-; CHECK-PWR8-NEXT:    [[OFFSET_IDX_1:%.*]] = sub nsw i64 [[TMP1]], [[INDEX_NEXT]]
-; CHECK-PWR8-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 [[OFFSET_IDX_1]]
-; CHECK-PWR8-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <2 x double>*
-; CHECK-PWR8-NEXT:    [[WIDE_LOAD_1:%.*]] = load <2 x double>, <2 x double>* [[TMP31]], align 8
-; CHECK-PWR8-NEXT:    [[REVERSE_1:%.*]] = shufflevector <2 x double> [[WIDE_LOAD_1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-PWR8-NEXT:    [[TMP32:%.*]] = fadd <2 x double> [[REVERSE_1]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP33:%.*]] = fmul <2 x double> [[REVERSE_1]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP34:%.*]] = fadd <2 x double> [[TMP32]], [[TMP33]]
-; CHECK-PWR8-NEXT:    [[TMP35:%.*]] = fadd <2 x double> [[TMP34]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP36:%.*]] = fmul <2 x double> [[TMP35]], <double 5.000000e-01, double 5.000000e-01>
-; CHECK-PWR8-NEXT:    [[TMP37:%.*]] = fadd <2 x double> [[TMP33]], [[TMP36]]
-; CHECK-PWR8-NEXT:    [[TMP38:%.*]] = fsub <2 x double> [[TMP37]], [[TMP32]]
-; CHECK-PWR8-NEXT:    [[TMP39:%.*]] = fadd <2 x double> [[REVERSE_1]], [[TMP38]]
-; CHECK-PWR8-NEXT:    [[TMP40:%.*]] = fdiv <2 x double> [[TMP35]], [[TMP39]]
-; CHECK-PWR8-NEXT:    [[TMP41:%.*]] = fmul <2 x double> [[TMP35]], [[TMP40]]
-; CHECK-PWR8-NEXT:    [[TMP42:%.*]] = fmul <2 x double> [[TMP33]], [[TMP41]]
-; CHECK-PWR8-NEXT:    [[TMP43:%.*]] = fmul <2 x double> [[TMP32]], [[TMP42]]
-; CHECK-PWR8-NEXT:    [[TMP44:%.*]] = fadd <2 x double> [[TMP43]], <double -3.000000e+00, double -3.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP45:%.*]] = fsub <2 x double> [[REVERSE_1]], [[TMP32]]
-; CHECK-PWR8-NEXT:    [[TMP46:%.*]] = fadd <2 x double> [[TMP33]], [[TMP45]]
-; CHECK-PWR8-NEXT:    [[TMP47:%.*]] = fadd <2 x double> [[TMP46]], [[TMP40]]
-; CHECK-PWR8-NEXT:    [[TMP48:%.*]] = fadd <2 x double> [[TMP47]], [[TMP44]]
-; CHECK-PWR8-NEXT:    [[TMP49:%.*]] = fadd <2 x double> [[TMP48]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP50:%.*]] = fmul <2 x double> [[REVERSE_1]], [[TMP49]]
-; CHECK-PWR8-NEXT:    [[REVERSE1_1:%.*]] = shufflevector <2 x double> [[TMP50]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-PWR8-NEXT:    store <2 x double> [[REVERSE1_1]], <2 x double>* [[TMP31]], align 8
-; CHECK-PWR8-NEXT:    [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 4
-; CHECK-PWR8-NEXT:    [[NITER_NEXT_1]] = add nuw nsw i64 [[NITER]], 2
-; CHECK-PWR8-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
-; CHECK-PWR8-NEXT:    br i1 [[NITER_NCMP_1]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-PWR8:       middle.block.unr-lcssa:
-; CHECK-PWR8-NEXT:    [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_1]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
-; CHECK-PWR8-NEXT:    br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]]
-; CHECK-PWR8:       vector.body.epil:
-; CHECK-PWR8-NEXT:    [[OFFSET_IDX_EPIL:%.*]] = sub nsw i64 [[TMP1]], [[INDEX_UNR]]
-; CHECK-PWR8-NEXT:    [[TMP51:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 [[OFFSET_IDX_EPIL]]
-; CHECK-PWR8-NEXT:    [[TMP52:%.*]] = bitcast double* [[TMP51]] to <2 x double>*
-; CHECK-PWR8-NEXT:    [[WIDE_LOAD_EPIL:%.*]] = load <2 x double>, <2 x double>* [[TMP52]], align 8
-; CHECK-PWR8-NEXT:    [[REVERSE_EPIL:%.*]] = shufflevector <2 x double> [[WIDE_LOAD_EPIL]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-PWR8-NEXT:    [[TMP53:%.*]] = fadd <2 x double> [[REVERSE_EPIL]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP54:%.*]] = fmul <2 x double> [[REVERSE_EPIL]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP55:%.*]] = fadd <2 x double> [[TMP53]], [[TMP54]]
-; CHECK-PWR8-NEXT:    [[TMP56:%.*]] = fadd <2 x double> [[TMP55]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP57:%.*]] = fmul <2 x double> [[TMP56]], <double 5.000000e-01, double 5.000000e-01>
-; CHECK-PWR8-NEXT:    [[TMP58:%.*]] = fadd <2 x double> [[TMP54]], [[TMP57]]
-; CHECK-PWR8-NEXT:    [[TMP59:%.*]] = fsub <2 x double> [[TMP58]], [[TMP53]]
-; CHECK-PWR8-NEXT:    [[TMP60:%.*]] = fadd <2 x double> [[REVERSE_EPIL]], [[TMP59]]
-; CHECK-PWR8-NEXT:    [[TMP61:%.*]] = fdiv <2 x double> [[TMP56]], [[TMP60]]
-; CHECK-PWR8-NEXT:    [[TMP62:%.*]] = fmul <2 x double> [[TMP56]], [[TMP61]]
-; CHECK-PWR8-NEXT:    [[TMP63:%.*]] = fmul <2 x double> [[TMP54]], [[TMP62]]
-; CHECK-PWR8-NEXT:    [[TMP64:%.*]] = fmul <2 x double> [[TMP53]], [[TMP63]]
-; CHECK-PWR8-NEXT:    [[TMP65:%.*]] = fadd <2 x double> [[TMP64]], <double -3.000000e+00, double -3.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP66:%.*]] = fsub <2 x double> [[REVERSE_EPIL]], [[TMP53]]
-; CHECK-PWR8-NEXT:    [[TMP67:%.*]] = fadd <2 x double> [[TMP54]], [[TMP66]]
-; CHECK-PWR8-NEXT:    [[TMP68:%.*]] = fadd <2 x double> [[TMP67]], [[TMP61]]
-; CHECK-PWR8-NEXT:    [[TMP69:%.*]] = fadd <2 x double> [[TMP68]], [[TMP65]]
-; CHECK-PWR8-NEXT:    [[TMP70:%.*]] = fadd <2 x double> [[TMP69]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-PWR8-NEXT:    [[TMP71:%.*]] = fmul <2 x double> [[REVERSE_EPIL]], [[TMP70]]
-; CHECK-PWR8-NEXT:    [[REVERSE1_EPIL:%.*]] = shufflevector <2 x double> [[TMP71]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-PWR8-NEXT:    store <2 x double> [[REVERSE1_EPIL]], <2 x double>* [[TMP52]], align 8
-; CHECK-PWR8-NEXT:    br label [[MIDDLE_BLOCK]]
-; CHECK-PWR8:       middle.block:
-; CHECK-PWR8-NEXT:    [[IND_END:%.*]] = sub nsw i64 [[TMP1]], [[N_VEC]]
-; CHECK-PWR8-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
-; CHECK-PWR8-NEXT:    br i1 [[CMP_N]], label [[DOTLOOPEXIT:%.*]], label [[SCALAR_PH_PREHEADER]]
-; CHECK-PWR8:       scalar.ph.preheader:
-; CHECK-PWR8-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
-; CHECK-PWR8-NEXT:    br label [[SCALAR_PH:%.*]]
-; CHECK-PWR8:       scalar.ph:
-; CHECK-PWR8-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[SCALAR_PH]] ], [ [[INDVARS_IV_PH]], [[SCALAR_PH_PREHEADER]] ]
-; CHECK-PWR8-NEXT:    [[TMP72:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
-; CHECK-PWR8-NEXT:    [[TMP73:%.*]] = load double, double* [[TMP72]], align 8
-; CHECK-PWR8-NEXT:    [[TMP74:%.*]] = fadd double [[TMP73]], 3.000000e+00
-; CHECK-PWR8-NEXT:    [[TMP75:%.*]] = fmul double [[TMP73]], 2.000000e+00
-; CHECK-PWR8-NEXT:    [[TMP76:%.*]] = fadd double [[TMP74]], [[TMP75]]
-; CHECK-PWR8-NEXT:    [[TMP77:%.*]] = fadd double [[TMP76]], 2.000000e+00
-; CHECK-PWR8-NEXT:    [[TMP78:%.*]] = fmul double [[TMP77]], 5.000000e-01
-; CHECK-PWR8-NEXT:    [[TMP79:%.*]] = fadd double [[TMP75]], [[TMP78]]
-; CHECK-PWR8-NEXT:    [[TMP80:%.*]] = fsub double [[TMP79]], [[TMP74]]
-; CHECK-PWR8-NEXT:    [[TMP81:%.*]] = fadd double [[TMP73]], [[TMP80]]
-; CHECK-PWR8-NEXT:    [[TMP82:%.*]] = fdiv double [[TMP77]], [[TMP81]]
-; CHECK-PWR8-NEXT:    [[TMP83:%.*]] = fmul double [[TMP77]], [[TMP82]]
-; CHECK-PWR8-NEXT:    [[TMP84:%.*]] = fmul double [[TMP75]], [[TMP83]]
-; CHECK-PWR8-NEXT:    [[TMP85:%.*]] = fmul double [[TMP74]], [[TMP84]]
-; CHECK-PWR8-NEXT:    [[TMP86:%.*]] = fadd double [[TMP85]], -3.000000e+00
-; CHECK-PWR8-NEXT:    [[TMP87:%.*]] = fsub double [[TMP73]], [[TMP74]]
-; CHECK-PWR8-NEXT:    [[TMP88:%.*]] = fadd double [[TMP75]], [[TMP87]]
-; CHECK-PWR8-NEXT:    [[TMP89:%.*]] = fadd double [[TMP88]], [[TMP82]]
-; CHECK-PWR8-NEXT:    [[TMP90:%.*]] = fadd double [[TMP89]], [[TMP86]]
-; CHECK-PWR8-NEXT:    [[TMP91:%.*]] = fadd double [[TMP90]], 3.000000e+00
-; CHECK-PWR8-NEXT:    [[TMP92:%.*]] = fmul double [[TMP73]], [[TMP91]]
-; CHECK-PWR8-NEXT:    store double [[TMP92]], double* [[TMP72]], align 8
-; CHECK-PWR8-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
-; CHECK-PWR8-NEXT:    [[TMP93:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-PWR8-NEXT:    [[TMP94:%.*]] = icmp eq i32 [[TMP93]], 0
-; CHECK-PWR8-NEXT:    br i1 [[TMP94]], label [[DOTLOOPEXIT]], label [[SCALAR_PH]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-PWR8:       .loopexit:
-; CHECK-PWR8-NEXT:    ret void
-;
-; CHECK-PWR9-LABEL: @double_(
-; CHECK-PWR9-NEXT:    [[TMP1:%.*]] = sext i32 [[N:%.*]] to i64
-; CHECK-PWR9-NEXT:    [[TMP2:%.*]] = and i32 [[N]], 1
-; CHECK-PWR9-NEXT:    [[LCMP_MOD_NOT_NOT:%.*]] = icmp eq i32 [[TMP2]], 0
-; CHECK-PWR9-NEXT:    br i1 [[LCMP_MOD_NOT_NOT]], label [[DOTPROL_LOOPEXIT_UNR_LCSSA:%.*]], label [[DOTPROL_LOOPEXIT:%.*]]
-; CHECK-PWR9:       .prol.loopexit.unr-lcssa:
-; CHECK-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP1]]
-; CHECK-PWR9-NEXT:    [[TMP4:%.*]] = load double, double* [[TMP3]], align 8
-; CHECK-PWR9-NEXT:    [[TMP5:%.*]] = fadd double [[TMP4]], 3.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP6:%.*]] = fmul double [[TMP4]], 2.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP7:%.*]] = fadd double [[TMP5]], [[TMP6]]
-; CHECK-PWR9-NEXT:    [[TMP8:%.*]] = fadd double [[TMP7]], 2.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP9:%.*]] = fmul double [[TMP8]], 5.000000e-01
-; CHECK-PWR9-NEXT:    [[TMP10:%.*]] = fadd double [[TMP6]], [[TMP9]]
-; CHECK-PWR9-NEXT:    [[TMP11:%.*]] = fsub double [[TMP10]], [[TMP5]]
-; CHECK-PWR9-NEXT:    [[TMP12:%.*]] = fadd double [[TMP4]], [[TMP11]]
-; CHECK-PWR9-NEXT:    [[TMP13:%.*]] = fdiv double [[TMP8]], [[TMP12]]
-; CHECK-PWR9-NEXT:    [[TMP14:%.*]] = fmul double [[TMP8]], [[TMP13]]
-; CHECK-PWR9-NEXT:    [[TMP15:%.*]] = fmul double [[TMP6]], [[TMP14]]
-; CHECK-PWR9-NEXT:    [[TMP16:%.*]] = fmul double [[TMP5]], [[TMP15]]
-; CHECK-PWR9-NEXT:    [[TMP17:%.*]] = fadd double [[TMP16]], -3.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP18:%.*]] = fsub double [[TMP4]], [[TMP5]]
-; CHECK-PWR9-NEXT:    [[TMP19:%.*]] = fadd double [[TMP6]], [[TMP18]]
-; CHECK-PWR9-NEXT:    [[TMP20:%.*]] = fadd double [[TMP19]], [[TMP13]]
-; CHECK-PWR9-NEXT:    [[TMP21:%.*]] = fadd double [[TMP20]], [[TMP17]]
-; CHECK-PWR9-NEXT:    [[TMP22:%.*]] = fadd double [[TMP21]], 3.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP23:%.*]] = fmul double [[TMP4]], [[TMP22]]
-; CHECK-PWR9-NEXT:    store double [[TMP23]], double* [[TMP3]], align 8
-; CHECK-PWR9-NEXT:    [[INDVARS_IV_NEXT_PROL:%.*]] = add nsw i64 [[TMP1]], -1
-; CHECK-PWR9-NEXT:    br label [[DOTPROL_LOOPEXIT]]
-; CHECK-PWR9:       .prol.loopexit:
-; CHECK-PWR9-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[INDVARS_IV_NEXT_PROL]], [[DOTPROL_LOOPEXIT_UNR_LCSSA]] ]
-; CHECK-PWR9-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[N]], 0
-; CHECK-PWR9-NEXT:    br i1 [[TMP24]], label [[DOTUNR_LCSSA:%.*]], label [[DOTNEW:%.*]]
-; CHECK-PWR9:       .new:
-; CHECK-PWR9-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[DOTNEW]] ], [ [[INDVARS_IV_UNR]], [[DOTPROL_LOOPEXIT]] ]
-; CHECK-PWR9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
-; CHECK-PWR9-NEXT:    [[TMP26:%.*]] = load double, double* [[TMP25]], align 8
-; CHECK-PWR9-NEXT:    [[TMP27:%.*]] = fadd double [[TMP26]], 3.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP28:%.*]] = fmul double [[TMP26]], 2.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP29:%.*]] = fadd double [[TMP27]], [[TMP28]]
-; CHECK-PWR9-NEXT:    [[TMP30:%.*]] = fadd double [[TMP29]], 2.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP31:%.*]] = fmul double [[TMP30]], 5.000000e-01
-; CHECK-PWR9-NEXT:    [[TMP32:%.*]] = fadd double [[TMP28]], [[TMP31]]
-; CHECK-PWR9-NEXT:    [[TMP33:%.*]] = fsub double [[TMP32]], [[TMP27]]
-; CHECK-PWR9-NEXT:    [[TMP34:%.*]] = fadd double [[TMP26]], [[TMP33]]
-; CHECK-PWR9-NEXT:    [[TMP35:%.*]] = fdiv double [[TMP30]], [[TMP34]]
-; CHECK-PWR9-NEXT:    [[TMP36:%.*]] = fmul double [[TMP30]], [[TMP35]]
-; CHECK-PWR9-NEXT:    [[TMP37:%.*]] = fmul double [[TMP28]], [[TMP36]]
-; CHECK-PWR9-NEXT:    [[TMP38:%.*]] = fmul double [[TMP27]], [[TMP37]]
-; CHECK-PWR9-NEXT:    [[TMP39:%.*]] = fadd double [[TMP38]], -3.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP40:%.*]] = fsub double [[TMP26]], [[TMP27]]
-; CHECK-PWR9-NEXT:    [[TMP41:%.*]] = fadd double [[TMP28]], [[TMP40]]
-; CHECK-PWR9-NEXT:    [[TMP42:%.*]] = fadd double [[TMP41]], [[TMP35]]
-; CHECK-PWR9-NEXT:    [[TMP43:%.*]] = fadd double [[TMP42]], [[TMP39]]
-; CHECK-PWR9-NEXT:    [[TMP44:%.*]] = fadd double [[TMP43]], 3.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP45:%.*]] = fmul double [[TMP26]], [[TMP44]]
-; CHECK-PWR9-NEXT:    store double [[TMP45]], double* [[TMP25]], align 8
-; CHECK-PWR9-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1
-; CHECK-PWR9-NEXT:    [[TMP46:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-PWR9-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP46]], align 8
-; CHECK-PWR9-NEXT:    [[TMP48:%.*]] = fadd double [[TMP47]], 3.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP49:%.*]] = fmul double [[TMP47]], 2.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP50:%.*]] = fadd double [[TMP48]], [[TMP49]]
-; CHECK-PWR9-NEXT:    [[TMP51:%.*]] = fadd double [[TMP50]], 2.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP52:%.*]] = fmul double [[TMP51]], 5.000000e-01
-; CHECK-PWR9-NEXT:    [[TMP53:%.*]] = fadd double [[TMP49]], [[TMP52]]
-; CHECK-PWR9-NEXT:    [[TMP54:%.*]] = fsub double [[TMP53]], [[TMP48]]
-; CHECK-PWR9-NEXT:    [[TMP55:%.*]] = fadd double [[TMP47]], [[TMP54]]
-; CHECK-PWR9-NEXT:    [[TMP56:%.*]] = fdiv double [[TMP51]], [[TMP55]]
-; CHECK-PWR9-NEXT:    [[TMP57:%.*]] = fmul double [[TMP51]], [[TMP56]]
-; CHECK-PWR9-NEXT:    [[TMP58:%.*]] = fmul double [[TMP49]], [[TMP57]]
-; CHECK-PWR9-NEXT:    [[TMP59:%.*]] = fmul double [[TMP48]], [[TMP58]]
-; CHECK-PWR9-NEXT:    [[TMP60:%.*]] = fadd double [[TMP59]], -3.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP61:%.*]] = fsub double [[TMP47]], [[TMP48]]
-; CHECK-PWR9-NEXT:    [[TMP62:%.*]] = fadd double [[TMP49]], [[TMP61]]
-; CHECK-PWR9-NEXT:    [[TMP63:%.*]] = fadd double [[TMP62]], [[TMP56]]
-; CHECK-PWR9-NEXT:    [[TMP64:%.*]] = fadd double [[TMP63]], [[TMP60]]
-; CHECK-PWR9-NEXT:    [[TMP65:%.*]] = fadd double [[TMP64]], 3.000000e+00
-; CHECK-PWR9-NEXT:    [[TMP66:%.*]] = fmul double [[TMP47]], [[TMP65]]
-; CHECK-PWR9-NEXT:    store double [[TMP66]], double* [[TMP46]], align 8
-; CHECK-PWR9-NEXT:    [[INDVARS_IV_NEXT_1]] = add nsw i64 [[INDVARS_IV]], -2
-; CHECK-PWR9-NEXT:    [[TMP67:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-PWR9-NEXT:    [[TMP68:%.*]] = icmp eq i32 [[TMP67]], 0
-; CHECK-PWR9-NEXT:    br i1 [[TMP68]], label [[DOTUNR_LCSSA]], label [[DOTNEW]]
-; CHECK-PWR9:       .unr-lcssa:
-; CHECK-PWR9-NEXT:    ret void
-;
-
+;CHECK-LABEL: double_
+;CHECK-PWR8: LV(REG): VF = 2
+;CHECK-PWR8: LV(REG): Found max usage: 2 item
+;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
+;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 5 registers
+;CHECK-PWR8: LV(REG): Found invariant usage: 1 item
+;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 1 registers
+
+;CHECK-PWR9: LV(REG): VF = 1
+;CHECK-PWR9: LV(REG): Found max usage: 2 item
+;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
+;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 5 registers
+;CHECK-PWR9: LV(REG): Found invariant usage: 1 item
+;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers
 
   %1 = sext i32 %n to i64
   br label %2
@@ -1134,240 +221,11 @@ define void @double_(double* nocapture %A, i32 %n) nounwind uwtable ssp {
 }
 
 define ppc_fp128 @fp128_(ppc_fp128* nocapture %n, ppc_fp128 %d) nounwind readonly {
-; CHECK-PWR8-LABEL: @fp128_(
-; CHECK-PWR8-NEXT:  entry:
-; CHECK-PWR8-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-PWR8:       vector.body:
-; CHECK-PWR8-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI:%.*]] = phi ppc_fp128 [ [[D:%.*]], [[ENTRY]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI1:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI2:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI3:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI4:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI5:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI6:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI7:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP43:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI8:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP44:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI9:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP45:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI10:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP46:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[VEC_PHI11:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP47:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT:    [[INDUCTION12:%.*]] = or i32 [[INDEX]], 1
-; CHECK-PWR8-NEXT:    [[INDUCTION13:%.*]] = or i32 [[INDEX]], 2
-; CHECK-PWR8-NEXT:    [[INDUCTION14:%.*]] = or i32 [[INDEX]], 3
-; CHECK-PWR8-NEXT:    [[INDUCTION15:%.*]] = add nuw nsw i32 [[INDEX]], 4
-; CHECK-PWR8-NEXT:    [[INDUCTION16:%.*]] = add nuw nsw i32 [[INDEX]], 5
-; CHECK-PWR8-NEXT:    [[INDUCTION17:%.*]] = add nuw nsw i32 [[INDEX]], 6
-; CHECK-PWR8-NEXT:    [[INDUCTION18:%.*]] = add nuw nsw i32 [[INDEX]], 7
-; CHECK-PWR8-NEXT:    [[INDUCTION19:%.*]] = add nuw nsw i32 [[INDEX]], 8
-; CHECK-PWR8-NEXT:    [[INDUCTION20:%.*]] = add nuw nsw i32 [[INDEX]], 9
-; CHECK-PWR8-NEXT:    [[INDUCTION21:%.*]] = add nuw nsw i32 [[INDEX]], 10
-; CHECK-PWR8-NEXT:    [[INDUCTION22:%.*]] = add nuw nsw i32 [[INDEX]], 11
-; CHECK-PWR8-NEXT:    [[TMP0:%.*]] = zext i32 [[INDEX]] to i64
-; CHECK-PWR8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N:%.*]], i64 [[TMP0]]
-; CHECK-PWR8-NEXT:    [[TMP2:%.*]] = zext i32 [[INDUCTION12]] to i64
-; CHECK-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP2]]
-; CHECK-PWR8-NEXT:    [[TMP4:%.*]] = zext i32 [[INDUCTION13]] to i64
-; CHECK-PWR8-NEXT:    [[TMP5:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP4]]
-; CHECK-PWR8-NEXT:    [[TMP6:%.*]] = zext i32 [[INDUCTION14]] to i64
-; CHECK-PWR8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP6]]
-; CHECK-PWR8-NEXT:    [[TMP8:%.*]] = zext i32 [[INDUCTION15]] to i64
-; CHECK-PWR8-NEXT:    [[TMP9:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP8]]
-; CHECK-PWR8-NEXT:    [[TMP10:%.*]] = zext i32 [[INDUCTION16]] to i64
-; CHECK-PWR8-NEXT:    [[TMP11:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP10]]
-; CHECK-PWR8-NEXT:    [[TMP12:%.*]] = zext i32 [[INDUCTION17]] to i64
-; CHECK-PWR8-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP12]]
-; CHECK-PWR8-NEXT:    [[TMP14:%.*]] = zext i32 [[INDUCTION18]] to i64
-; CHECK-PWR8-NEXT:    [[TMP15:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP14]]
-; CHECK-PWR8-NEXT:    [[TMP16:%.*]] = zext i32 [[INDUCTION19]] to i64
-; CHECK-PWR8-NEXT:    [[TMP17:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP16]]
-; CHECK-PWR8-NEXT:    [[TMP18:%.*]] = zext i32 [[INDUCTION20]] to i64
-; CHECK-PWR8-NEXT:    [[TMP19:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP18]]
-; CHECK-PWR8-NEXT:    [[TMP20:%.*]] = zext i32 [[INDUCTION21]] to i64
-; CHECK-PWR8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP20]]
-; CHECK-PWR8-NEXT:    [[TMP22:%.*]] = zext i32 [[INDUCTION22]] to i64
-; CHECK-PWR8-NEXT:    [[TMP23:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP22]]
-; CHECK-PWR8-NEXT:    [[TMP24:%.*]] = load ppc_fp128, ppc_fp128* [[TMP1]], align 8
-; CHECK-PWR8-NEXT:    [[TMP25:%.*]] = load ppc_fp128, ppc_fp128* [[TMP3]], align 8
-; CHECK-PWR8-NEXT:    [[TMP26:%.*]] = load ppc_fp128, ppc_fp128* [[TMP5]], align 8
-; CHECK-PWR8-NEXT:    [[TMP27:%.*]] = load ppc_fp128, ppc_fp128* [[TMP7]], align 8
-; CHECK-PWR8-NEXT:    [[TMP28:%.*]] = load ppc_fp128, ppc_fp128* [[TMP9]], align 8
-; CHECK-PWR8-NEXT:    [[TMP29:%.*]] = load ppc_fp128, ppc_fp128* [[TMP11]], align 8
-; CHECK-PWR8-NEXT:    [[TMP30:%.*]] = load ppc_fp128, ppc_fp128* [[TMP13]], align 8
-; CHECK-PWR8-NEXT:    [[TMP31:%.*]] = load ppc_fp128, ppc_fp128* [[TMP15]], align 8
-; CHECK-PWR8-NEXT:    [[TMP32:%.*]] = load ppc_fp128, ppc_fp128* [[TMP17]], align 8
-; CHECK-PWR8-NEXT:    [[TMP33:%.*]] = load ppc_fp128, ppc_fp128* [[TMP19]], align 8
-; CHECK-PWR8-NEXT:    [[TMP34:%.*]] = load ppc_fp128, ppc_fp128* [[TMP21]], align 8
-; CHECK-PWR8-NEXT:    [[TMP35:%.*]] = load ppc_fp128, ppc_fp128* [[TMP23]], align 8
-; CHECK-PWR8-NEXT:    [[TMP36]] = fsub fast ppc_fp128 [[VEC_PHI]], [[TMP24]]
-; CHECK-PWR8-NEXT:    [[TMP37]] = fsub fast ppc_fp128 [[VEC_PHI1]], [[TMP25]]
-; CHECK-PWR8-NEXT:    [[TMP38]] = fsub fast ppc_fp128 [[VEC_PHI2]], [[TMP26]]
-; CHECK-PWR8-NEXT:    [[TMP39]] = fsub fast ppc_fp128 [[VEC_PHI3]], [[TMP27]]
-; CHECK-PWR8-NEXT:    [[TMP40]] = fsub fast ppc_fp128 [[VEC_PHI4]], [[TMP28]]
-; CHECK-PWR8-NEXT:    [[TMP41]] = fsub fast ppc_fp128 [[VEC_PHI5]], [[TMP29]]
-; CHECK-PWR8-NEXT:    [[TMP42]] = fsub fast ppc_fp128 [[VEC_PHI6]], [[TMP30]]
-; CHECK-PWR8-NEXT:    [[TMP43]] = fsub fast ppc_fp128 [[VEC_PHI7]], [[TMP31]]
-; CHECK-PWR8-NEXT:    [[TMP44]] = fsub fast ppc_fp128 [[VEC_PHI8]], [[TMP32]]
-; CHECK-PWR8-NEXT:    [[TMP45]] = fsub fast ppc_fp128 [[VEC_PHI9]], [[TMP33]]
-; CHECK-PWR8-NEXT:    [[TMP46]] = fsub fast ppc_fp128 [[VEC_PHI10]], [[TMP34]]
-; CHECK-PWR8-NEXT:    [[TMP47]] = fsub fast ppc_fp128 [[VEC_PHI11]], [[TMP35]]
-; CHECK-PWR8-NEXT:    [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 12
-; CHECK-PWR8-NEXT:    [[TMP48:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2040
-; CHECK-PWR8-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-PWR8:       middle.block:
-; CHECK-PWR8-NEXT:    [[BIN_RDX:%.*]] = fadd fast ppc_fp128 [[TMP37]], [[TMP36]]
-; CHECK-PWR8-NEXT:    [[BIN_RDX23:%.*]] = fadd fast ppc_fp128 [[BIN_RDX]], [[TMP38]]
-; CHECK-PWR8-NEXT:    [[BIN_RDX24:%.*]] = fadd fast ppc_fp128 [[BIN_RDX23]], [[TMP39]]
-; CHECK-PWR8-NEXT:    [[BIN_RDX25:%.*]] = fadd fast ppc_fp128 [[BIN_RDX24]], [[TMP40]]
-; CHECK-PWR8-NEXT:    [[BIN_RDX26:%.*]] = fadd fast ppc_fp128 [[BIN_RDX25]], [[TMP41]]
-; CHECK-PWR8-NEXT:    [[BIN_RDX27:%.*]] = fadd fast ppc_fp128 [[BIN_RDX26]], [[TMP42]]
-; CHECK-PWR8-NEXT:    [[BIN_RDX28:%.*]] = fadd fast ppc_fp128 [[BIN_RDX27]], [[TMP43]]
-; CHECK-PWR8-NEXT:    [[BIN_RDX29:%.*]] = fadd fast ppc_fp128 [[BIN_RDX28]], [[TMP44]]
-; CHECK-PWR8-NEXT:    [[BIN_RDX30:%.*]] = fadd fast ppc_fp128 [[BIN_RDX29]], [[TMP45]]
-; CHECK-PWR8-NEXT:    [[BIN_RDX31:%.*]] = fadd fast ppc_fp128 [[BIN_RDX30]], [[TMP46]]
-; CHECK-PWR8-NEXT:    [[BIN_RDX32:%.*]] = fadd fast ppc_fp128 [[BIN_RDX31]], [[TMP47]]
-; CHECK-PWR8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2040
-; CHECK-PWR8-NEXT:    [[TMP49:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX]], align 8
-; CHECK-PWR8-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2041
-; CHECK-PWR8-NEXT:    [[TMP50:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_1]], align 8
-; CHECK-PWR8-NEXT:    [[TMP51:%.*]] = fadd fast ppc_fp128 [[TMP49]], [[TMP50]]
-; CHECK-PWR8-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2042
-; CHECK-PWR8-NEXT:    [[TMP52:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_2]], align 8
-; CHECK-PWR8-NEXT:    [[TMP53:%.*]] = fadd fast ppc_fp128 [[TMP51]], [[TMP52]]
-; CHECK-PWR8-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2043
-; CHECK-PWR8-NEXT:    [[TMP54:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_3]], align 8
-; CHECK-PWR8-NEXT:    [[TMP55:%.*]] = fadd fast ppc_fp128 [[TMP53]], [[TMP54]]
-; CHECK-PWR8-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2044
-; CHECK-PWR8-NEXT:    [[TMP56:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_4]], align 8
-; CHECK-PWR8-NEXT:    [[TMP57:%.*]] = fadd fast ppc_fp128 [[TMP55]], [[TMP56]]
-; CHECK-PWR8-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2045
-; CHECK-PWR8-NEXT:    [[TMP58:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_5]], align 8
-; CHECK-PWR8-NEXT:    [[TMP59:%.*]] = fadd fast ppc_fp128 [[TMP57]], [[TMP58]]
-; CHECK-PWR8-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2046
-; CHECK-PWR8-NEXT:    [[TMP60:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_6]], align 8
-; CHECK-PWR8-NEXT:    [[TMP61:%.*]] = fadd fast ppc_fp128 [[TMP59]], [[TMP60]]
-; CHECK-PWR8-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2047
-; CHECK-PWR8-NEXT:    [[TMP62:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_7]], align 8
-; CHECK-PWR8-NEXT:    [[TMP63:%.*]] = fadd fast ppc_fp128 [[TMP61]], [[TMP62]]
-; CHECK-PWR8-NEXT:    [[SUB_7:%.*]] = fsub fast ppc_fp128 [[BIN_RDX32]], [[TMP63]]
-; CHECK-PWR8-NEXT:    ret ppc_fp128 [[SUB_7]]
-;
-; CHECK-PWR9-LABEL: @fp128_(
-; CHECK-PWR9-NEXT:  entry:
-; CHECK-PWR9-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-PWR9:       vector.body:
-; CHECK-PWR9-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI:%.*]] = phi ppc_fp128 [ [[D:%.*]], [[ENTRY]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI1:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI2:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI3:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI4:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI5:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI6:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI7:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP43:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI8:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP44:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI9:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP45:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI10:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP46:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[VEC_PHI11:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP47:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT:    [[INDUCTION12:%.*]] = or i32 [[INDEX]], 1
-; CHECK-PWR9-NEXT:    [[INDUCTION13:%.*]] = or i32 [[INDEX]], 2
-; CHECK-PWR9-NEXT:    [[INDUCTION14:%.*]] = or i32 [[INDEX]], 3
-; CHECK-PWR9-NEXT:    [[INDUCTION15:%.*]] = add nuw nsw i32 [[INDEX]], 4
-; CHECK-PWR9-NEXT:    [[INDUCTION16:%.*]] = add nuw nsw i32 [[INDEX]], 5
-; CHECK-PWR9-NEXT:    [[INDUCTION17:%.*]] = add nuw nsw i32 [[INDEX]], 6
-; CHECK-PWR9-NEXT:    [[INDUCTION18:%.*]] = add nuw nsw i32 [[INDEX]], 7
-; CHECK-PWR9-NEXT:    [[INDUCTION19:%.*]] = add nuw nsw i32 [[INDEX]], 8
-; CHECK-PWR9-NEXT:    [[INDUCTION20:%.*]] = add nuw nsw i32 [[INDEX]], 9
-; CHECK-PWR9-NEXT:    [[INDUCTION21:%.*]] = add nuw nsw i32 [[INDEX]], 10
-; CHECK-PWR9-NEXT:    [[INDUCTION22:%.*]] = add nuw nsw i32 [[INDEX]], 11
-; CHECK-PWR9-NEXT:    [[TMP0:%.*]] = zext i32 [[INDEX]] to i64
-; CHECK-PWR9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N:%.*]], i64 [[TMP0]]
-; CHECK-PWR9-NEXT:    [[TMP2:%.*]] = zext i32 [[INDUCTION12]] to i64
-; CHECK-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP2]]
-; CHECK-PWR9-NEXT:    [[TMP4:%.*]] = zext i32 [[INDUCTION13]] to i64
-; CHECK-PWR9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP4]]
-; CHECK-PWR9-NEXT:    [[TMP6:%.*]] = zext i32 [[INDUCTION14]] to i64
-; CHECK-PWR9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP6]]
-; CHECK-PWR9-NEXT:    [[TMP8:%.*]] = zext i32 [[INDUCTION15]] to i64
-; CHECK-PWR9-NEXT:    [[TMP9:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP8]]
-; CHECK-PWR9-NEXT:    [[TMP10:%.*]] = zext i32 [[INDUCTION16]] to i64
-; CHECK-PWR9-NEXT:    [[TMP11:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP10]]
-; CHECK-PWR9-NEXT:    [[TMP12:%.*]] = zext i32 [[INDUCTION17]] to i64
-; CHECK-PWR9-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP12]]
-; CHECK-PWR9-NEXT:    [[TMP14:%.*]] = zext i32 [[INDUCTION18]] to i64
-; CHECK-PWR9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP14]]
-; CHECK-PWR9-NEXT:    [[TMP16:%.*]] = zext i32 [[INDUCTION19]] to i64
-; CHECK-PWR9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP16]]
-; CHECK-PWR9-NEXT:    [[TMP18:%.*]] = zext i32 [[INDUCTION20]] to i64
-; CHECK-PWR9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP18]]
-; CHECK-PWR9-NEXT:    [[TMP20:%.*]] = zext i32 [[INDUCTION21]] to i64
-; CHECK-PWR9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP20]]
-; CHECK-PWR9-NEXT:    [[TMP22:%.*]] = zext i32 [[INDUCTION22]] to i64
-; CHECK-PWR9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP22]]
-; CHECK-PWR9-NEXT:    [[TMP24:%.*]] = load ppc_fp128, ppc_fp128* [[TMP1]], align 8
-; CHECK-PWR9-NEXT:    [[TMP25:%.*]] = load ppc_fp128, ppc_fp128* [[TMP3]], align 8
-; CHECK-PWR9-NEXT:    [[TMP26:%.*]] = load ppc_fp128, ppc_fp128* [[TMP5]], align 8
-; CHECK-PWR9-NEXT:    [[TMP27:%.*]] = load ppc_fp128, ppc_fp128* [[TMP7]], align 8
-; CHECK-PWR9-NEXT:    [[TMP28:%.*]] = load ppc_fp128, ppc_fp128* [[TMP9]], align 8
-; CHECK-PWR9-NEXT:    [[TMP29:%.*]] = load ppc_fp128, ppc_fp128* [[TMP11]], align 8
-; CHECK-PWR9-NEXT:    [[TMP30:%.*]] = load ppc_fp128, ppc_fp128* [[TMP13]], align 8
-; CHECK-PWR9-NEXT:    [[TMP31:%.*]] = load ppc_fp128, ppc_fp128* [[TMP15]], align 8
-; CHECK-PWR9-NEXT:    [[TMP32:%.*]] = load ppc_fp128, ppc_fp128* [[TMP17]], align 8
-; CHECK-PWR9-NEXT:    [[TMP33:%.*]] = load ppc_fp128, ppc_fp128* [[TMP19]], align 8
-; CHECK-PWR9-NEXT:    [[TMP34:%.*]] = load ppc_fp128, ppc_fp128* [[TMP21]], align 8
-; CHECK-PWR9-NEXT:    [[TMP35:%.*]] = load ppc_fp128, ppc_fp128* [[TMP23]], align 8
-; CHECK-PWR9-NEXT:    [[TMP36]] = fsub fast ppc_fp128 [[VEC_PHI]], [[TMP24]]
-; CHECK-PWR9-NEXT:    [[TMP37]] = fsub fast ppc_fp128 [[VEC_PHI1]], [[TMP25]]
-; CHECK-PWR9-NEXT:    [[TMP38]] = fsub fast ppc_fp128 [[VEC_PHI2]], [[TMP26]]
-; CHECK-PWR9-NEXT:    [[TMP39]] = fsub fast ppc_fp128 [[VEC_PHI3]], [[TMP27]]
-; CHECK-PWR9-NEXT:    [[TMP40]] = fsub fast ppc_fp128 [[VEC_PHI4]], [[TMP28]]
-; CHECK-PWR9-NEXT:    [[TMP41]] = fsub fast ppc_fp128 [[VEC_PHI5]], [[TMP29]]
-; CHECK-PWR9-NEXT:    [[TMP42]] = fsub fast ppc_fp128 [[VEC_PHI6]], [[TMP30]]
-; CHECK-PWR9-NEXT:    [[TMP43]] = fsub fast ppc_fp128 [[VEC_PHI7]], [[TMP31]]
-; CHECK-PWR9-NEXT:    [[TMP44]] = fsub fast ppc_fp128 [[VEC_PHI8]], [[TMP32]]
-; CHECK-PWR9-NEXT:    [[TMP45]] = fsub fast ppc_fp128 [[VEC_PHI9]], [[TMP33]]
-; CHECK-PWR9-NEXT:    [[TMP46]] = fsub fast ppc_fp128 [[VEC_PHI10]], [[TMP34]]
-; CHECK-PWR9-NEXT:    [[TMP47]] = fsub fast ppc_fp128 [[VEC_PHI11]], [[TMP35]]
-; CHECK-PWR9-NEXT:    [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 12
-; CHECK-PWR9-NEXT:    [[TMP48:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2040
-; CHECK-PWR9-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-PWR9:       middle.block:
-; CHECK-PWR9-NEXT:    [[BIN_RDX:%.*]] = fadd fast ppc_fp128 [[TMP37]], [[TMP36]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX23:%.*]] = fadd fast ppc_fp128 [[BIN_RDX]], [[TMP38]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX24:%.*]] = fadd fast ppc_fp128 [[BIN_RDX23]], [[TMP39]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX25:%.*]] = fadd fast ppc_fp128 [[BIN_RDX24]], [[TMP40]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX26:%.*]] = fadd fast ppc_fp128 [[BIN_RDX25]], [[TMP41]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX27:%.*]] = fadd fast ppc_fp128 [[BIN_RDX26]], [[TMP42]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX28:%.*]] = fadd fast ppc_fp128 [[BIN_RDX27]], [[TMP43]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX29:%.*]] = fadd fast ppc_fp128 [[BIN_RDX28]], [[TMP44]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX30:%.*]] = fadd fast ppc_fp128 [[BIN_RDX29]], [[TMP45]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX31:%.*]] = fadd fast ppc_fp128 [[BIN_RDX30]], [[TMP46]]
-; CHECK-PWR9-NEXT:    [[BIN_RDX32:%.*]] = fadd fast ppc_fp128 [[BIN_RDX31]], [[TMP47]]
-; CHECK-PWR9-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2040
-; CHECK-PWR9-NEXT:    [[TMP49:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX]], align 8
-; CHECK-PWR9-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2041
-; CHECK-PWR9-NEXT:    [[TMP50:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_1]], align 8
-; CHECK-PWR9-NEXT:    [[TMP51:%.*]] = fadd fast ppc_fp128 [[TMP49]], [[TMP50]]
-; CHECK-PWR9-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2042
-; CHECK-PWR9-NEXT:    [[TMP52:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_2]], align 8
-; CHECK-PWR9-NEXT:    [[TMP53:%.*]] = fadd fast ppc_fp128 [[TMP51]], [[TMP52]]
-; CHECK-PWR9-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2043
-; CHECK-PWR9-NEXT:    [[TMP54:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_3]], align 8
-; CHECK-PWR9-NEXT:    [[TMP55:%.*]] = fadd fast ppc_fp128 [[TMP53]], [[TMP54]]
-; CHECK-PWR9-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2044
-; CHECK-PWR9-NEXT:    [[TMP56:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_4]], align 8
-; CHECK-PWR9-NEXT:    [[TMP57:%.*]] = fadd fast ppc_fp128 [[TMP55]], [[TMP56]]
-; CHECK-PWR9-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2045
-; CHECK-PWR9-NEXT:    [[TMP58:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_5]], align 8
-; CHECK-PWR9-NEXT:    [[TMP59:%.*]] = fadd fast ppc_fp128 [[TMP57]], [[TMP58]]
-; CHECK-PWR9-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2046
-; CHECK-PWR9-NEXT:    [[TMP60:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_6]], align 8
-; CHECK-PWR9-NEXT:    [[TMP61:%.*]] = fadd fast ppc_fp128 [[TMP59]], [[TMP60]]
-; CHECK-PWR9-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2047
-; CHECK-PWR9-NEXT:    [[TMP62:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_7]], align 8
-; CHECK-PWR9-NEXT:    [[TMP63:%.*]] = fadd fast ppc_fp128 [[TMP61]], [[TMP62]]
-; CHECK-PWR9-NEXT:    [[SUB_7:%.*]] = fsub fast ppc_fp128 [[BIN_RDX32]], [[TMP63]]
-; CHECK-PWR9-NEXT:    ret ppc_fp128 [[SUB_7]]
-;
+;CHECK-LABEL: fp128_
+;CHECK: LV(REG): VF = 1
+;CHECK: LV(REG): Found max usage: 2 item
+;CHECK: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
+;CHECK: LV(REG): RegisterClass: PPC::VRRC, 2 registers
 entry:
   br label %for.body
 
@@ -1387,262 +245,11 @@ for.end:                                          ; preds = %for.body
 
 
 define void @fp16_(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 {
-; CHECK-PWR8-LABEL: @fp16_(
-; CHECK-PWR8-NEXT:  entry:
-; CHECK-PWR8-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[SCALE_COERCE:%.*]] to i16
-; CHECK-PWR8-NEXT:    [[TMP0:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-; CHECK-PWR8-NEXT:    [[MUL:%.*]] = mul i32 [[NUMCOLS:%.*]], [[NUMROWS:%.*]]
-; CHECK-PWR8-NEXT:    [[CMP26:%.*]] = icmp ult i32 [[MUL]], 4
-; CHECK-PWR8-NEXT:    br i1 [[CMP26]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
-; CHECK-PWR8:       while.body.preheader:
-; CHECK-PWR8-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 2
-; CHECK-PWR8-NEXT:    [[TMP1:%.*]] = add nsw i32 [[SHR]], -1
-; CHECK-PWR8-NEXT:    [[XTRAITER:%.*]] = and i32 [[SHR]], 7
-; CHECK-PWR8-NEXT:    [[LCMP_MOD_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 0
-; CHECK-PWR8-NEXT:    br i1 [[LCMP_MOD_NOT]], label [[WHILE_BODY_PROL_LOOPEXIT:%.*]], label [[WHILE_BODY_PROL:%.*]]
-; CHECK-PWR8:       while.body.prol:
-; CHECK-PWR8-NEXT:    [[PIN_ADDR_029_PROL:%.*]] = phi half* [ [[ADD_PTR_PROL:%.*]], [[WHILE_BODY_PROL]] ], [ [[PIN:%.*]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR8-NEXT:    [[POUT_ADDR_028_PROL:%.*]] = phi half* [ [[ADD_PTR7_PROL:%.*]], [[WHILE_BODY_PROL]] ], [ [[POUT:%.*]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR8-NEXT:    [[BLKCNT_027_PROL:%.*]] = phi i32 [ [[DEC_PROL:%.*]], [[WHILE_BODY_PROL]] ], [ [[SHR]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR8-NEXT:    [[PROL_ITER:%.*]] = phi i32 [ [[PROL_ITER_NEXT:%.*]], [[WHILE_BODY_PROL]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR8-NEXT:    [[TMP2:%.*]] = load half, half* [[PIN_ADDR_029_PROL]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX2_PROL:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029_PROL]], i64 1
-; CHECK-PWR8-NEXT:    [[TMP3:%.*]] = load half, half* [[ARRAYIDX2_PROL]], align 2
-; CHECK-PWR8-NEXT:    [[MUL3_PROL:%.*]] = fmul half [[TMP2]], [[TMP0]]
-; CHECK-PWR8-NEXT:    [[MUL4_PROL:%.*]] = fmul half [[TMP3]], [[TMP0]]
-; CHECK-PWR8-NEXT:    store half [[MUL3_PROL]], half* [[POUT_ADDR_028_PROL]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX6_PROL:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028_PROL]], i64 1
-; CHECK-PWR8-NEXT:    store half [[MUL4_PROL]], half* [[ARRAYIDX6_PROL]], align 2
-; CHECK-PWR8-NEXT:    [[ADD_PTR_PROL]] = getelementptr inbounds half, half* [[PIN_ADDR_029_PROL]], i64 2
-; CHECK-PWR8-NEXT:    [[ADD_PTR7_PROL]] = getelementptr inbounds half, half* [[POUT_ADDR_028_PROL]], i64 2
-; CHECK-PWR8-NEXT:    [[DEC_PROL]] = add nsw i32 [[BLKCNT_027_PROL]], -1
-; CHECK-PWR8-NEXT:    [[PROL_ITER_NEXT]] = add i32 [[PROL_ITER]], 1
-; CHECK-PWR8-NEXT:    [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i32 [[PROL_ITER_NEXT]], [[XTRAITER]]
-; CHECK-PWR8-NEXT:    br i1 [[PROL_ITER_CMP_NOT]], label [[WHILE_BODY_PROL_LOOPEXIT]], label [[WHILE_BODY_PROL]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-PWR8:       while.body.prol.loopexit:
-; CHECK-PWR8-NEXT:    [[PIN_ADDR_029_UNR:%.*]] = phi half* [ [[PIN]], [[WHILE_BODY_PREHEADER]] ], [ [[ADD_PTR_PROL]], [[WHILE_BODY_PROL]] ]
-; CHECK-PWR8-NEXT:    [[POUT_ADDR_028_UNR:%.*]] = phi half* [ [[POUT]], [[WHILE_BODY_PREHEADER]] ], [ [[ADD_PTR7_PROL]], [[WHILE_BODY_PROL]] ]
-; CHECK-PWR8-NEXT:    [[BLKCNT_027_UNR:%.*]] = phi i32 [ [[SHR]], [[WHILE_BODY_PREHEADER]] ], [ [[DEC_PROL]], [[WHILE_BODY_PROL]] ]
-; CHECK-PWR8-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP1]], 7
-; CHECK-PWR8-NEXT:    br i1 [[TMP4]], label [[WHILE_END]], label [[WHILE_BODY:%.*]]
-; CHECK-PWR8:       while.body:
-; CHECK-PWR8-NEXT:    [[PIN_ADDR_029:%.*]] = phi half* [ [[ADD_PTR_7:%.*]], [[WHILE_BODY]] ], [ [[PIN_ADDR_029_UNR]], [[WHILE_BODY_PROL_LOOPEXIT]] ]
-; CHECK-PWR8-NEXT:    [[POUT_ADDR_028:%.*]] = phi half* [ [[ADD_PTR7_7:%.*]], [[WHILE_BODY]] ], [ [[POUT_ADDR_028_UNR]], [[WHILE_BODY_PROL_LOOPEXIT]] ]
-; CHECK-PWR8-NEXT:    [[BLKCNT_027:%.*]] = phi i32 [ [[DEC_7:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_027_UNR]], [[WHILE_BODY_PROL_LOOPEXIT]] ]
-; CHECK-PWR8-NEXT:    [[TMP5:%.*]] = load half, half* [[PIN_ADDR_029]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 1
-; CHECK-PWR8-NEXT:    [[TMP6:%.*]] = load half, half* [[ARRAYIDX2]], align 2
-; CHECK-PWR8-NEXT:    [[MUL3:%.*]] = fmul half [[TMP5]], [[TMP0]]
-; CHECK-PWR8-NEXT:    [[MUL4:%.*]] = fmul half [[TMP6]], [[TMP0]]
-; CHECK-PWR8-NEXT:    store half [[MUL3]], half* [[POUT_ADDR_028]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 1
-; CHECK-PWR8-NEXT:    store half [[MUL4]], half* [[ARRAYIDX6]], align 2
-; CHECK-PWR8-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 2
-; CHECK-PWR8-NEXT:    [[ADD_PTR7:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 2
-; CHECK-PWR8-NEXT:    [[TMP7:%.*]] = load half, half* [[ADD_PTR]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 3
-; CHECK-PWR8-NEXT:    [[TMP8:%.*]] = load half, half* [[ARRAYIDX2_1]], align 2
-; CHECK-PWR8-NEXT:    [[MUL3_1:%.*]] = fmul half [[TMP7]], [[TMP0]]
-; CHECK-PWR8-NEXT:    [[MUL4_1:%.*]] = fmul half [[TMP8]], [[TMP0]]
-; CHECK-PWR8-NEXT:    store half [[MUL3_1]], half* [[ADD_PTR7]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX6_1:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 3
-; CHECK-PWR8-NEXT:    store half [[MUL4_1]], half* [[ARRAYIDX6_1]], align 2
-; CHECK-PWR8-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 4
-; CHECK-PWR8-NEXT:    [[ADD_PTR7_1:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 4
-; CHECK-PWR8-NEXT:    [[TMP9:%.*]] = load half, half* [[ADD_PTR_1]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 5
-; CHECK-PWR8-NEXT:    [[TMP10:%.*]] = load half, half* [[ARRAYIDX2_2]], align 2
-; CHECK-PWR8-NEXT:    [[MUL3_2:%.*]] = fmul half [[TMP9]], [[TMP0]]
-; CHECK-PWR8-NEXT:    [[MUL4_2:%.*]] = fmul half [[TMP10]], [[TMP0]]
-; CHECK-PWR8-NEXT:    store half [[MUL3_2]], half* [[ADD_PTR7_1]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX6_2:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 5
-; CHECK-PWR8-NEXT:    store half [[MUL4_2]], half* [[ARRAYIDX6_2]], align 2
-; CHECK-PWR8-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 6
-; CHECK-PWR8-NEXT:    [[ADD_PTR7_2:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 6
-; CHECK-PWR8-NEXT:    [[TMP11:%.*]] = load half, half* [[ADD_PTR_2]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 7
-; CHECK-PWR8-NEXT:    [[TMP12:%.*]] = load half, half* [[ARRAYIDX2_3]], align 2
-; CHECK-PWR8-NEXT:    [[MUL3_3:%.*]] = fmul half [[TMP11]], [[TMP0]]
-; CHECK-PWR8-NEXT:    [[MUL4_3:%.*]] = fmul half [[TMP12]], [[TMP0]]
-; CHECK-PWR8-NEXT:    store half [[MUL3_3]], half* [[ADD_PTR7_2]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX6_3:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 7
-; CHECK-PWR8-NEXT:    store half [[MUL4_3]], half* [[ARRAYIDX6_3]], align 2
-; CHECK-PWR8-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 8
-; CHECK-PWR8-NEXT:    [[ADD_PTR7_3:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 8
-; CHECK-PWR8-NEXT:    [[TMP13:%.*]] = load half, half* [[ADD_PTR_3]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 9
-; CHECK-PWR8-NEXT:    [[TMP14:%.*]] = load half, half* [[ARRAYIDX2_4]], align 2
-; CHECK-PWR8-NEXT:    [[MUL3_4:%.*]] = fmul half [[TMP13]], [[TMP0]]
-; CHECK-PWR8-NEXT:    [[MUL4_4:%.*]] = fmul half [[TMP14]], [[TMP0]]
-; CHECK-PWR8-NEXT:    store half [[MUL3_4]], half* [[ADD_PTR7_3]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX6_4:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 9
-; CHECK-PWR8-NEXT:    store half [[MUL4_4]], half* [[ARRAYIDX6_4]], align 2
-; CHECK-PWR8-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 10
-; CHECK-PWR8-NEXT:    [[ADD_PTR7_4:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 10
-; CHECK-PWR8-NEXT:    [[TMP15:%.*]] = load half, half* [[ADD_PTR_4]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 11
-; CHECK-PWR8-NEXT:    [[TMP16:%.*]] = load half, half* [[ARRAYIDX2_5]], align 2
-; CHECK-PWR8-NEXT:    [[MUL3_5:%.*]] = fmul half [[TMP15]], [[TMP0]]
-; CHECK-PWR8-NEXT:    [[MUL4_5:%.*]] = fmul half [[TMP16]], [[TMP0]]
-; CHECK-PWR8-NEXT:    store half [[MUL3_5]], half* [[ADD_PTR7_4]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX6_5:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 11
-; CHECK-PWR8-NEXT:    store half [[MUL4_5]], half* [[ARRAYIDX6_5]], align 2
-; CHECK-PWR8-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 12
-; CHECK-PWR8-NEXT:    [[ADD_PTR7_5:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 12
-; CHECK-PWR8-NEXT:    [[TMP17:%.*]] = load half, half* [[ADD_PTR_5]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 13
-; CHECK-PWR8-NEXT:    [[TMP18:%.*]] = load half, half* [[ARRAYIDX2_6]], align 2
-; CHECK-PWR8-NEXT:    [[MUL3_6:%.*]] = fmul half [[TMP17]], [[TMP0]]
-; CHECK-PWR8-NEXT:    [[MUL4_6:%.*]] = fmul half [[TMP18]], [[TMP0]]
-; CHECK-PWR8-NEXT:    store half [[MUL3_6]], half* [[ADD_PTR7_5]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX6_6:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 13
-; CHECK-PWR8-NEXT:    store half [[MUL4_6]], half* [[ARRAYIDX6_6]], align 2
-; CHECK-PWR8-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 14
-; CHECK-PWR8-NEXT:    [[ADD_PTR7_6:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 14
-; CHECK-PWR8-NEXT:    [[TMP19:%.*]] = load half, half* [[ADD_PTR_6]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 15
-; CHECK-PWR8-NEXT:    [[TMP20:%.*]] = load half, half* [[ARRAYIDX2_7]], align 2
-; CHECK-PWR8-NEXT:    [[MUL3_7:%.*]] = fmul half [[TMP19]], [[TMP0]]
-; CHECK-PWR8-NEXT:    [[MUL4_7:%.*]] = fmul half [[TMP20]], [[TMP0]]
-; CHECK-PWR8-NEXT:    store half [[MUL3_7]], half* [[ADD_PTR7_6]], align 2
-; CHECK-PWR8-NEXT:    [[ARRAYIDX6_7:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 15
-; CHECK-PWR8-NEXT:    store half [[MUL4_7]], half* [[ARRAYIDX6_7]], align 2
-; CHECK-PWR8-NEXT:    [[ADD_PTR_7]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 16
-; CHECK-PWR8-NEXT:    [[ADD_PTR7_7]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 16
-; CHECK-PWR8-NEXT:    [[DEC_7]] = add nsw i32 [[BLKCNT_027]], -8
-; CHECK-PWR8-NEXT:    [[CMP_7:%.*]] = icmp eq i32 [[DEC_7]], 0
-; CHECK-PWR8-NEXT:    br i1 [[CMP_7]], label [[WHILE_END]], label [[WHILE_BODY]]
-; CHECK-PWR8:       while.end:
-; CHECK-PWR8-NEXT:    ret void
-;
-; CHECK-PWR9-LABEL: @fp16_(
-; CHECK-PWR9-NEXT:  entry:
-; CHECK-PWR9-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[SCALE_COERCE:%.*]] to i16
-; CHECK-PWR9-NEXT:    [[TMP0:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-; CHECK-PWR9-NEXT:    [[MUL:%.*]] = mul i32 [[NUMCOLS:%.*]], [[NUMROWS:%.*]]
-; CHECK-PWR9-NEXT:    [[CMP26:%.*]] = icmp ult i32 [[MUL]], 4
-; CHECK-PWR9-NEXT:    br i1 [[CMP26]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
-; CHECK-PWR9:       while.body.preheader:
-; CHECK-PWR9-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 2
-; CHECK-PWR9-NEXT:    [[TMP1:%.*]] = add nsw i32 [[SHR]], -1
-; CHECK-PWR9-NEXT:    [[XTRAITER:%.*]] = and i32 [[SHR]], 7
-; CHECK-PWR9-NEXT:    [[LCMP_MOD_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 0
-; CHECK-PWR9-NEXT:    br i1 [[LCMP_MOD_NOT]], label [[WHILE_BODY_PROL_LOOPEXIT:%.*]], label [[WHILE_BODY_PROL:%.*]]
-; CHECK-PWR9:       while.body.prol:
-; CHECK-PWR9-NEXT:    [[PIN_ADDR_029_PROL:%.*]] = phi half* [ [[ADD_PTR_PROL:%.*]], [[WHILE_BODY_PROL]] ], [ [[PIN:%.*]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR9-NEXT:    [[POUT_ADDR_028_PROL:%.*]] = phi half* [ [[ADD_PTR7_PROL:%.*]], [[WHILE_BODY_PROL]] ], [ [[POUT:%.*]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR9-NEXT:    [[BLKCNT_027_PROL:%.*]] = phi i32 [ [[DEC_PROL:%.*]], [[WHILE_BODY_PROL]] ], [ [[SHR]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR9-NEXT:    [[PROL_ITER:%.*]] = phi i32 [ [[PROL_ITER_NEXT:%.*]], [[WHILE_BODY_PROL]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR9-NEXT:    [[TMP2:%.*]] = load half, half* [[PIN_ADDR_029_PROL]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX2_PROL:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029_PROL]], i64 1
-; CHECK-PWR9-NEXT:    [[TMP3:%.*]] = load half, half* [[ARRAYIDX2_PROL]], align 2
-; CHECK-PWR9-NEXT:    [[MUL3_PROL:%.*]] = fmul half [[TMP2]], [[TMP0]]
-; CHECK-PWR9-NEXT:    [[MUL4_PROL:%.*]] = fmul half [[TMP3]], [[TMP0]]
-; CHECK-PWR9-NEXT:    store half [[MUL3_PROL]], half* [[POUT_ADDR_028_PROL]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX6_PROL:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028_PROL]], i64 1
-; CHECK-PWR9-NEXT:    store half [[MUL4_PROL]], half* [[ARRAYIDX6_PROL]], align 2
-; CHECK-PWR9-NEXT:    [[ADD_PTR_PROL]] = getelementptr inbounds half, half* [[PIN_ADDR_029_PROL]], i64 2
-; CHECK-PWR9-NEXT:    [[ADD_PTR7_PROL]] = getelementptr inbounds half, half* [[POUT_ADDR_028_PROL]], i64 2
-; CHECK-PWR9-NEXT:    [[DEC_PROL]] = add nsw i32 [[BLKCNT_027_PROL]], -1
-; CHECK-PWR9-NEXT:    [[PROL_ITER_NEXT]] = add i32 [[PROL_ITER]], 1
-; CHECK-PWR9-NEXT:    [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i32 [[PROL_ITER_NEXT]], [[XTRAITER]]
-; CHECK-PWR9-NEXT:    br i1 [[PROL_ITER_CMP_NOT]], label [[WHILE_BODY_PROL_LOOPEXIT]], label [[WHILE_BODY_PROL]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-PWR9:       while.body.prol.loopexit:
-; CHECK-PWR9-NEXT:    [[PIN_ADDR_029_UNR:%.*]] = phi half* [ [[PIN]], [[WHILE_BODY_PREHEADER]] ], [ [[ADD_PTR_PROL]], [[WHILE_BODY_PROL]] ]
-; CHECK-PWR9-NEXT:    [[POUT_ADDR_028_UNR:%.*]] = phi half* [ [[POUT]], [[WHILE_BODY_PREHEADER]] ], [ [[ADD_PTR7_PROL]], [[WHILE_BODY_PROL]] ]
-; CHECK-PWR9-NEXT:    [[BLKCNT_027_UNR:%.*]] = phi i32 [ [[SHR]], [[WHILE_BODY_PREHEADER]] ], [ [[DEC_PROL]], [[WHILE_BODY_PROL]] ]
-; CHECK-PWR9-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP1]], 7
-; CHECK-PWR9-NEXT:    br i1 [[TMP4]], label [[WHILE_END]], label [[WHILE_BODY:%.*]]
-; CHECK-PWR9:       while.body:
-; CHECK-PWR9-NEXT:    [[PIN_ADDR_029:%.*]] = phi half* [ [[ADD_PTR_7:%.*]], [[WHILE_BODY]] ], [ [[PIN_ADDR_029_UNR]], [[WHILE_BODY_PROL_LOOPEXIT]] ]
-; CHECK-PWR9-NEXT:    [[POUT_ADDR_028:%.*]] = phi half* [ [[ADD_PTR7_7:%.*]], [[WHILE_BODY]] ], [ [[POUT_ADDR_028_UNR]], [[WHILE_BODY_PROL_LOOPEXIT]] ]
-; CHECK-PWR9-NEXT:    [[BLKCNT_027:%.*]] = phi i32 [ [[DEC_7:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_027_UNR]], [[WHILE_BODY_PROL_LOOPEXIT]] ]
-; CHECK-PWR9-NEXT:    [[TMP5:%.*]] = load half, half* [[PIN_ADDR_029]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 1
-; CHECK-PWR9-NEXT:    [[TMP6:%.*]] = load half, half* [[ARRAYIDX2]], align 2
-; CHECK-PWR9-NEXT:    [[MUL3:%.*]] = fmul half [[TMP5]], [[TMP0]]
-; CHECK-PWR9-NEXT:    [[MUL4:%.*]] = fmul half [[TMP6]], [[TMP0]]
-; CHECK-PWR9-NEXT:    store half [[MUL3]], half* [[POUT_ADDR_028]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 1
-; CHECK-PWR9-NEXT:    store half [[MUL4]], half* [[ARRAYIDX6]], align 2
-; CHECK-PWR9-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 2
-; CHECK-PWR9-NEXT:    [[ADD_PTR7:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 2
-; CHECK-PWR9-NEXT:    [[TMP7:%.*]] = load half, half* [[ADD_PTR]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 3
-; CHECK-PWR9-NEXT:    [[TMP8:%.*]] = load half, half* [[ARRAYIDX2_1]], align 2
-; CHECK-PWR9-NEXT:    [[MUL3_1:%.*]] = fmul half [[TMP7]], [[TMP0]]
-; CHECK-PWR9-NEXT:    [[MUL4_1:%.*]] = fmul half [[TMP8]], [[TMP0]]
-; CHECK-PWR9-NEXT:    store half [[MUL3_1]], half* [[ADD_PTR7]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX6_1:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 3
-; CHECK-PWR9-NEXT:    store half [[MUL4_1]], half* [[ARRAYIDX6_1]], align 2
-; CHECK-PWR9-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 4
-; CHECK-PWR9-NEXT:    [[ADD_PTR7_1:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 4
-; CHECK-PWR9-NEXT:    [[TMP9:%.*]] = load half, half* [[ADD_PTR_1]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 5
-; CHECK-PWR9-NEXT:    [[TMP10:%.*]] = load half, half* [[ARRAYIDX2_2]], align 2
-; CHECK-PWR9-NEXT:    [[MUL3_2:%.*]] = fmul half [[TMP9]], [[TMP0]]
-; CHECK-PWR9-NEXT:    [[MUL4_2:%.*]] = fmul half [[TMP10]], [[TMP0]]
-; CHECK-PWR9-NEXT:    store half [[MUL3_2]], half* [[ADD_PTR7_1]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX6_2:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 5
-; CHECK-PWR9-NEXT:    store half [[MUL4_2]], half* [[ARRAYIDX6_2]], align 2
-; CHECK-PWR9-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 6
-; CHECK-PWR9-NEXT:    [[ADD_PTR7_2:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 6
-; CHECK-PWR9-NEXT:    [[TMP11:%.*]] = load half, half* [[ADD_PTR_2]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 7
-; CHECK-PWR9-NEXT:    [[TMP12:%.*]] = load half, half* [[ARRAYIDX2_3]], align 2
-; CHECK-PWR9-NEXT:    [[MUL3_3:%.*]] = fmul half [[TMP11]], [[TMP0]]
-; CHECK-PWR9-NEXT:    [[MUL4_3:%.*]] = fmul half [[TMP12]], [[TMP0]]
-; CHECK-PWR9-NEXT:    store half [[MUL3_3]], half* [[ADD_PTR7_2]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX6_3:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 7
-; CHECK-PWR9-NEXT:    store half [[MUL4_3]], half* [[ARRAYIDX6_3]], align 2
-; CHECK-PWR9-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 8
-; CHECK-PWR9-NEXT:    [[ADD_PTR7_3:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 8
-; CHECK-PWR9-NEXT:    [[TMP13:%.*]] = load half, half* [[ADD_PTR_3]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 9
-; CHECK-PWR9-NEXT:    [[TMP14:%.*]] = load half, half* [[ARRAYIDX2_4]], align 2
-; CHECK-PWR9-NEXT:    [[MUL3_4:%.*]] = fmul half [[TMP13]], [[TMP0]]
-; CHECK-PWR9-NEXT:    [[MUL4_4:%.*]] = fmul half [[TMP14]], [[TMP0]]
-; CHECK-PWR9-NEXT:    store half [[MUL3_4]], half* [[ADD_PTR7_3]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX6_4:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 9
-; CHECK-PWR9-NEXT:    store half [[MUL4_4]], half* [[ARRAYIDX6_4]], align 2
-; CHECK-PWR9-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 10
-; CHECK-PWR9-NEXT:    [[ADD_PTR7_4:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 10
-; CHECK-PWR9-NEXT:    [[TMP15:%.*]] = load half, half* [[ADD_PTR_4]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 11
-; CHECK-PWR9-NEXT:    [[TMP16:%.*]] = load half, half* [[ARRAYIDX2_5]], align 2
-; CHECK-PWR9-NEXT:    [[MUL3_5:%.*]] = fmul half [[TMP15]], [[TMP0]]
-; CHECK-PWR9-NEXT:    [[MUL4_5:%.*]] = fmul half [[TMP16]], [[TMP0]]
-; CHECK-PWR9-NEXT:    store half [[MUL3_5]], half* [[ADD_PTR7_4]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX6_5:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 11
-; CHECK-PWR9-NEXT:    store half [[MUL4_5]], half* [[ARRAYIDX6_5]], align 2
-; CHECK-PWR9-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 12
-; CHECK-PWR9-NEXT:    [[ADD_PTR7_5:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 12
-; CHECK-PWR9-NEXT:    [[TMP17:%.*]] = load half, half* [[ADD_PTR_5]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 13
-; CHECK-PWR9-NEXT:    [[TMP18:%.*]] = load half, half* [[ARRAYIDX2_6]], align 2
-; CHECK-PWR9-NEXT:    [[MUL3_6:%.*]] = fmul half [[TMP17]], [[TMP0]]
-; CHECK-PWR9-NEXT:    [[MUL4_6:%.*]] = fmul half [[TMP18]], [[TMP0]]
-; CHECK-PWR9-NEXT:    store half [[MUL3_6]], half* [[ADD_PTR7_5]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX6_6:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 13
-; CHECK-PWR9-NEXT:    store half [[MUL4_6]], half* [[ARRAYIDX6_6]], align 2
-; CHECK-PWR9-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 14
-; CHECK-PWR9-NEXT:    [[ADD_PTR7_6:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 14
-; CHECK-PWR9-NEXT:    [[TMP19:%.*]] = load half, half* [[ADD_PTR_6]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 15
-; CHECK-PWR9-NEXT:    [[TMP20:%.*]] = load half, half* [[ARRAYIDX2_7]], align 2
-; CHECK-PWR9-NEXT:    [[MUL3_7:%.*]] = fmul half [[TMP19]], [[TMP0]]
-; CHECK-PWR9-NEXT:    [[MUL4_7:%.*]] = fmul half [[TMP20]], [[TMP0]]
-; CHECK-PWR9-NEXT:    store half [[MUL3_7]], half* [[ADD_PTR7_6]], align 2
-; CHECK-PWR9-NEXT:    [[ARRAYIDX6_7:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 15
-; CHECK-PWR9-NEXT:    store half [[MUL4_7]], half* [[ARRAYIDX6_7]], align 2
-; CHECK-PWR9-NEXT:    [[ADD_PTR_7]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 16
-; CHECK-PWR9-NEXT:    [[ADD_PTR7_7]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 16
-; CHECK-PWR9-NEXT:    [[DEC_7]] = add nsw i32 [[BLKCNT_027]], -8
-; CHECK-PWR9-NEXT:    [[CMP_7:%.*]] = icmp eq i32 [[DEC_7]], 0
-; CHECK-PWR9-NEXT:    br i1 [[CMP_7]], label [[WHILE_END]], label [[WHILE_BODY]]
-; CHECK-PWR9:       while.end:
-; CHECK-PWR9-NEXT:    ret void
-;
+;CHECK-LABEL: fp16_
+;CHECK: LV(REG): VF = 1
+;CHECK: LV(REG): Found max usage: 2 item
+;CHECK: LV(REG): RegisterClass: PPC::GPRRC, 4 registers
+;CHECK: LV(REG): RegisterClass: PPC::VSXRC, 2 registers
 entry:
   %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
   %0 = bitcast i16 %tmp.0.extract.trunc to half

diff  --git a/llvm/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll
index 92b1de381767f..42eefcb53787f 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll
@@ -1,150 +1,14 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -basic-aa -loop-vectorize < %s | FileCheck %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
 ; Function Attrs: nounwind
 define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 6
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 10
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 12
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 14
-; CHECK-NEXT:    [[TMP8:%.*]] = shl nsw i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = shl nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP10:%.*]] = shl nsw i64 [[TMP2]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = shl nsw i64 [[TMP3]], 1
-; CHECK-NEXT:    [[TMP12:%.*]] = shl nsw i64 [[TMP4]], 1
-; CHECK-NEXT:    [[TMP13:%.*]] = shl nsw i64 [[TMP5]], 1
-; CHECK-NEXT:    [[TMP14:%.*]] = shl nsw i64 [[TMP6]], 1
-; CHECK-NEXT:    [[TMP15:%.*]] = shl nsw i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds double, double* [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>*
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP18]], i32 0
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 0
-; CHECK-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP21]], i32 0
-; CHECK-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP22]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
-; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP23]], i32 0
-; CHECK-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, <4 x double>* [[TMP25]], align 8
-; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <4 x double>, <4 x double>* [[TMP27]], align 8
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, <4 x double>* [[TMP29]], align 8
-; CHECK-NEXT:    [[WIDE_VEC3:%.*]] = load <4 x double>, <4 x double>* [[TMP31]], align 8
-; CHECK-NEXT:    [[WIDE_VEC4:%.*]] = load <4 x double>, <4 x double>* [[TMP33]], align 8
-; CHECK-NEXT:    [[WIDE_VEC5:%.*]] = load <4 x double>, <4 x double>* [[TMP35]], align 8
-; CHECK-NEXT:    [[WIDE_VEC6:%.*]] = load <4 x double>, <4 x double>* [[TMP37]], align 8
-; CHECK-NEXT:    [[WIDE_VEC7:%.*]] = load <4 x double>, <4 x double>* [[TMP39]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <4 x double> [[WIDE_VEC1]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <4 x double> [[WIDE_VEC3]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC11:%.*]] = shufflevector <4 x double> [[WIDE_VEC4]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC12:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC13:%.*]] = shufflevector <4 x double> [[WIDE_VEC6]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC14:%.*]] = shufflevector <4 x double> [[WIDE_VEC7]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC15:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[STRIDED_VEC16:%.*]] = shufflevector <4 x double> [[WIDE_VEC1]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[STRIDED_VEC17:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[STRIDED_VEC18:%.*]] = shufflevector <4 x double> [[WIDE_VEC3]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[STRIDED_VEC19:%.*]] = shufflevector <4 x double> [[WIDE_VEC4]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[STRIDED_VEC20:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[STRIDED_VEC21:%.*]] = shufflevector <4 x double> [[WIDE_VEC6]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[STRIDED_VEC22:%.*]] = shufflevector <4 x double> [[WIDE_VEC7]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP40:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[STRIDED_VEC15]]
-; CHECK-NEXT:    [[TMP41:%.*]] = fadd <2 x double> [[STRIDED_VEC8]], [[STRIDED_VEC16]]
-; CHECK-NEXT:    [[TMP42:%.*]] = fadd <2 x double> [[STRIDED_VEC9]], [[STRIDED_VEC17]]
-; CHECK-NEXT:    [[TMP43:%.*]] = fadd <2 x double> [[STRIDED_VEC10]], [[STRIDED_VEC18]]
-; CHECK-NEXT:    [[TMP44:%.*]] = fadd <2 x double> [[STRIDED_VEC11]], [[STRIDED_VEC19]]
-; CHECK-NEXT:    [[TMP45:%.*]] = fadd <2 x double> [[STRIDED_VEC12]], [[STRIDED_VEC20]]
-; CHECK-NEXT:    [[TMP46:%.*]] = fadd <2 x double> [[STRIDED_VEC13]], [[STRIDED_VEC21]]
-; CHECK-NEXT:    [[TMP47:%.*]] = fadd <2 x double> [[STRIDED_VEC14]], [[STRIDED_VEC22]]
-; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP40]], <2 x double>* [[TMP57]], align 8
-; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 2
-; CHECK-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP41]], <2 x double>* [[TMP59]], align 8
-; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 4
-; CHECK-NEXT:    [[TMP61:%.*]] = bitcast double* [[TMP60]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP42]], <2 x double>* [[TMP61]], align 8
-; CHECK-NEXT:    [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 6
-; CHECK-NEXT:    [[TMP63:%.*]] = bitcast double* [[TMP62]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP43]], <2 x double>* [[TMP63]], align 8
-; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 8
-; CHECK-NEXT:    [[TMP65:%.*]] = bitcast double* [[TMP64]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP44]], <2 x double>* [[TMP65]], align 8
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 10
-; CHECK-NEXT:    [[TMP67:%.*]] = bitcast double* [[TMP66]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP45]], <2 x double>* [[TMP67]], align 8
-; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 12
-; CHECK-NEXT:    [[TMP69:%.*]] = bitcast double* [[TMP68]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP46]], <2 x double>* [[TMP69]], align 8
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 14
-; CHECK-NEXT:    [[TMP71:%.*]] = bitcast double* [[TMP70]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP47]], <2 x double>* [[TMP71]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT:    br i1 [[TMP72]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1600, 1600
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1600, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP73:%.*]] = shl nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[ODD_IDX:%.*]] = add nsw i64 [[TMP73]], 1
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP73]]
-; CHECK-NEXT:    [[ARRAYIDX_ODD:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[ODD_IDX]]
-; CHECK-NEXT:    [[TMP74:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[TMP75:%.*]] = load double, double* [[ARRAYIDX_ODD]], align 8
-; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP74]], [[TMP75]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store double [[ADD]], double* [[ARRAYIDX2]], align 8
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1600
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
 entry:
   br label %for.body
 
+; CHECK-LABEL: @foo
+; CHECK: <2 x double>
 
 for.cond.cleanup:                                 ; preds = %for.body
   ret void

diff  --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll
index 723241ca07383..94fc4dfc16e1c 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll
@@ -1,170 +1,11 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize < %s | FileCheck %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux"
 
 ; Function Attrs: nounwind
 define zeroext i32 @test() #0 {
-; CHECK-LABEL: @test(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [1600 x i32], align 4
-; CHECK-NEXT:    [[C:%.*]] = alloca [1600 x i32], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [1600 x i32]* [[A]] to i8*
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 6400, i8* [[TMP0]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add i32 [[TMP1]], 0
-; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT:    [[INDUCTION2:%.*]] = add i32 [[TMP1]], 2
-; CHECK-NEXT:    [[INDUCTION3:%.*]] = add i32 [[TMP1]], 3
-; CHECK-NEXT:    [[INDUCTION4:%.*]] = add i32 [[TMP1]], 4
-; CHECK-NEXT:    [[INDUCTION5:%.*]] = add i32 [[TMP1]], 5
-; CHECK-NEXT:    [[INDUCTION6:%.*]] = add i32 [[TMP1]], 6
-; CHECK-NEXT:    [[INDUCTION7:%.*]] = add i32 [[TMP1]], 7
-; CHECK-NEXT:    [[INDUCTION8:%.*]] = add i32 [[TMP1]], 8
-; CHECK-NEXT:    [[INDUCTION9:%.*]] = add i32 [[TMP1]], 9
-; CHECK-NEXT:    [[INDUCTION10:%.*]] = add i32 [[TMP1]], 10
-; CHECK-NEXT:    [[INDUCTION11:%.*]] = add i32 [[TMP1]], 11
-; CHECK-NEXT:    [[INDUCTION12:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[INDUCTION13:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[INDUCTION14:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[INDUCTION15:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[INDUCTION16:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[INDUCTION17:%.*]] = add i64 [[INDEX]], 5
-; CHECK-NEXT:    [[INDUCTION18:%.*]] = add i64 [[INDEX]], 6
-; CHECK-NEXT:    [[INDUCTION19:%.*]] = add i64 [[INDEX]], 7
-; CHECK-NEXT:    [[INDUCTION20:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[INDUCTION21:%.*]] = add i64 [[INDEX]], 9
-; CHECK-NEXT:    [[INDUCTION22:%.*]] = add i64 [[INDEX]], 10
-; CHECK-NEXT:    [[INDUCTION23:%.*]] = add i64 [[INDEX]], 11
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION12]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION13]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION14]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION15]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION16]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION17]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION18]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION19]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION20]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION21]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION22]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION23]]
-; CHECK-NEXT:    store i32 [[INDUCTION]], i32* [[TMP2]], align 4
-; CHECK-NEXT:    store i32 [[INDUCTION1]], i32* [[TMP3]], align 4
-; CHECK-NEXT:    store i32 [[INDUCTION2]], i32* [[TMP4]], align 4
-; CHECK-NEXT:    store i32 [[INDUCTION3]], i32* [[TMP5]], align 4
-; CHECK-NEXT:    store i32 [[INDUCTION4]], i32* [[TMP6]], align 4
-; CHECK-NEXT:    store i32 [[INDUCTION5]], i32* [[TMP7]], align 4
-; CHECK-NEXT:    store i32 [[INDUCTION6]], i32* [[TMP8]], align 4
-; CHECK-NEXT:    store i32 [[INDUCTION7]], i32* [[TMP9]], align 4
-; CHECK-NEXT:    store i32 [[INDUCTION8]], i32* [[TMP10]], align 4
-; CHECK-NEXT:    store i32 [[INDUCTION9]], i32* [[TMP11]], align 4
-; CHECK-NEXT:    store i32 [[INDUCTION10]], i32* [[TMP12]], align 4
-; CHECK-NEXT:    store i32 [[INDUCTION11]], i32* [[TMP13]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 12
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1596
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1600, 1596
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1596, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast [1600 x i32]* [[C]] to i8*
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 6400, i8* [[TMP15]]) #[[ATTR3]]
-; CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 0
-; CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 0
-; CHECK-NEXT:    [[CALL:%.*]] = call signext i32 @bar(i32* [[ARRAYDECAY]], i32* [[ARRAYDECAY1]]) #[[ATTR3]]
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH25:%.*]], label [[VECTOR_PH27:%.*]]
-; CHECK:       vector.ph27:
-; CHECK-NEXT:    br label [[VECTOR_BODY26:%.*]]
-; CHECK:       vector.body26:
-; CHECK-NEXT:    [[INDEX30:%.*]] = phi i64 [ 0, [[VECTOR_PH27]] ], [ [[INDEX_NEXT46:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP32:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT:    [[VEC_PHI31:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP33:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT:    [[VEC_PHI32:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP34:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT:    [[VEC_PHI33:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP35:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT:    [[VEC_PHI34:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP36:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT:    [[VEC_PHI35:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP37:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT:    [[VEC_PHI36:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP38:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT:    [[VEC_PHI37:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP39:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT:    [[INDUCTION38:%.*]] = add i64 [[INDEX30]], 0
-; CHECK-NEXT:    [[INDUCTION39:%.*]] = add i64 [[INDEX30]], 1
-; CHECK-NEXT:    [[INDUCTION40:%.*]] = add i64 [[INDEX30]], 2
-; CHECK-NEXT:    [[INDUCTION41:%.*]] = add i64 [[INDEX30]], 3
-; CHECK-NEXT:    [[INDUCTION42:%.*]] = add i64 [[INDEX30]], 4
-; CHECK-NEXT:    [[INDUCTION43:%.*]] = add i64 [[INDEX30]], 5
-; CHECK-NEXT:    [[INDUCTION44:%.*]] = add i64 [[INDEX30]], 6
-; CHECK-NEXT:    [[INDUCTION45:%.*]] = add i64 [[INDEX30]], 7
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION38]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION39]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION40]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION41]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION42]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION43]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION44]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION45]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP16]], align 4
-; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[TMP18]], align 4
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP19]], align 4
-; CHECK-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP20]], align 4
-; CHECK-NEXT:    [[TMP29:%.*]] = load i32, i32* [[TMP21]], align 4
-; CHECK-NEXT:    [[TMP30:%.*]] = load i32, i32* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP23]], align 4
-; CHECK-NEXT:    [[TMP32]] = add i32 [[TMP24]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP33]] = add i32 [[TMP25]], [[VEC_PHI31]]
-; CHECK-NEXT:    [[TMP34]] = add i32 [[TMP26]], [[VEC_PHI32]]
-; CHECK-NEXT:    [[TMP35]] = add i32 [[TMP27]], [[VEC_PHI33]]
-; CHECK-NEXT:    [[TMP36]] = add i32 [[TMP28]], [[VEC_PHI34]]
-; CHECK-NEXT:    [[TMP37]] = add i32 [[TMP29]], [[VEC_PHI35]]
-; CHECK-NEXT:    [[TMP38]] = add i32 [[TMP30]], [[VEC_PHI36]]
-; CHECK-NEXT:    [[TMP39]] = add i32 [[TMP31]], [[VEC_PHI37]]
-; CHECK-NEXT:    [[INDEX_NEXT46]] = add nuw i64 [[INDEX30]], 8
-; CHECK-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT46]], 1600
-; CHECK-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK24:%.*]], label [[VECTOR_BODY26]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       middle.block24:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP33]], [[TMP32]]
-; CHECK-NEXT:    [[BIN_RDX47:%.*]] = add i32 [[TMP34]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX48:%.*]] = add i32 [[TMP35]], [[BIN_RDX47]]
-; CHECK-NEXT:    [[BIN_RDX49:%.*]] = add i32 [[TMP36]], [[BIN_RDX48]]
-; CHECK-NEXT:    [[BIN_RDX50:%.*]] = add i32 [[TMP37]], [[BIN_RDX49]]
-; CHECK-NEXT:    [[BIN_RDX51:%.*]] = add i32 [[TMP38]], [[BIN_RDX50]]
-; CHECK-NEXT:    [[BIN_RDX52:%.*]] = add i32 [[TMP39]], [[BIN_RDX51]]
-; CHECK-NEXT:    [[CMP_N29:%.*]] = icmp eq i64 1600, 1600
-; CHECK-NEXT:    br i1 [[CMP_N29]], label [[FOR_COND_CLEANUP5:%.*]], label [[SCALAR_PH25]]
-; CHECK:       scalar.ph25:
-; CHECK-NEXT:    [[BC_RESUME_VAL28:%.*]] = phi i64 [ 1600, [[MIDDLE_BLOCK24]] ], [ 0, [[FOR_COND_CLEANUP]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_COND_CLEANUP]] ], [ [[BIN_RDX52]], [[MIDDLE_BLOCK24]] ]
-; CHECK-NEXT:    br label [[FOR_BODY6:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV25:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT26:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDVARS_IV25]]
-; CHECK-NEXT:    [[TMP41:%.*]] = trunc i64 [[INDVARS_IV25]] to i32
-; CHECK-NEXT:    store i32 [[TMP41]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT26]] = add nuw nsw i64 [[INDVARS_IV25]], 1
-; CHECK-NEXT:    [[EXITCOND27:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT26]], 1600
-; CHECK-NEXT:    br i1 [[EXITCOND27]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       for.cond.cleanup5:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY6]] ], [ [[BIN_RDX52]], [[MIDDLE_BLOCK24]] ]
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 6400, i8* nonnull [[TMP15]]) #[[ATTR3]]
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 6400, i8* [[TMP0]]) #[[ATTR3]]
-; CHECK-NEXT:    ret i32 [[ADD_LCSSA]]
-; CHECK:       for.body6:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL28]], [[SCALAR_PH25]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY6]] ]
-; CHECK-NEXT:    [[S_022:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH25]] ], [ [[ADD]], [[FOR_BODY6]] ]
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
-; CHECK-NEXT:    [[ADD]] = add i32 [[TMP42]], [[S_022]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1600
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP5]], label [[FOR_BODY6]], !llvm.loop [[LOOP4:![0-9]+]]
-;
+; CHECK-LABEL: @test
+; CHECK-NOT: x i32>
 
 entry:
   %a = alloca [1600 x i32], align 4

diff  --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
index c82c58cd718b4..15aec0d3539f4 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mcpu=pwr7 -mattr=+vsx -loop-vectorize -instcombine -S | FileCheck %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
@@ -9,51 +8,6 @@ target triple = "powerpc64-unknown-linux-gnu"
 @ntimes = external hidden unnamed_addr global i32, align 4
 
 define signext i32 @s173() #0 {
-; CHECK-LABEL: @s173(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @ntimes, align 4
-; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_END12:%.*]]
-; CHECK:       for.cond1.preheader.preheader:
-; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
-; CHECK:       for.cond1.preheader:
-; CHECK-NEXT:    [[NL_022:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_END:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA:%.*]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[TMP1]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 16
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i64 [[INDEX]], 16000
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP8]], align 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16000
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
-; CHECK:       for.body3:
-; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY3]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[INC11]] = add nuw nsw i32 [[NL_022]], 1
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* @ntimes, align 4
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 10
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC11]], [[MUL]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND1_PREHEADER]], label [[FOR_END12_LOOPEXIT:%.*]]
-; CHECK:       for.end12.loopexit:
-; CHECK-NEXT:    br label [[FOR_END12]]
-; CHECK:       for.end12:
-; CHECK-NEXT:    ret i32 0
-;
 entry:
   %0 = load i32, i32* @ntimes, align 4
   %cmp21 = icmp sgt i32 %0, 0
@@ -87,6 +41,10 @@ for.end:                                          ; preds = %for.body3
 for.end12:                                        ; preds = %for.end, %entry
   ret i32 0
 
+; CHECK-LABEL: @s173
+; CHECK: load <4 x float>, <4 x float>*
+; CHECK: add nsw i64 %index, 16000
+; CHECK: ret i32 0
 }
 
 attributes #0 = { nounwind }

diff  --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
index 62a4582421d28..fec0b66127039 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt -loop-vectorize -mtriple riscv64-linux-gnu \
 ; RUN:   -mattr=+v,+d -debug-only=loop-vectorize \
@@ -19,336 +18,26 @@
 
 define void @add(float* noalias nocapture readonly %src1, float* noalias nocapture readonly %src2, i32 signext %size, float* noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
-; CHECK-LMUL1-LABEL: @add(
-; CHECK-LMUL1-NEXT:  entry:
-; CHECK-LMUL1-NEXT:    [[CONV:%.*]] = zext i32 [[SIZE:%.*]] to i64
-; CHECK-LMUL1-NEXT:    [[CMP10_NOT:%.*]] = icmp eq i32 [[SIZE]], 0
-; CHECK-LMUL1-NEXT:    br i1 [[CMP10_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK-LMUL1:       for.body.preheader:
-; CHECK-LMUL1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CONV]], 8
-; CHECK-LMUL1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-LMUL1:       vector.ph:
-; CHECK-LMUL1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[CONV]], 8
-; CHECK-LMUL1-NEXT:    [[N_VEC:%.*]] = sub i64 [[CONV]], [[N_MOD_VF]]
-; CHECK-LMUL1-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-LMUL1:       vector.body:
-; CHECK-LMUL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-LMUL1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-LMUL1-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-LMUL1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[SRC1:%.*]], i64 [[TMP0]]
-; CHECK-LMUL1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP1]]
-; CHECK-LMUL1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-LMUL1-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-LMUL1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-LMUL1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-LMUL1-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-LMUL1-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-LMUL1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[SRC2:%.*]], i64 [[TMP0]]
-; CHECK-LMUL1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP1]]
-; CHECK-LMUL1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-LMUL1-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-LMUL1-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP11]], align 4
-; CHECK-LMUL1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 4
-; CHECK-LMUL1-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-LMUL1-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* [[TMP13]], align 4
-; CHECK-LMUL1-NEXT:    [[TMP14:%.*]] = fadd <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
-; CHECK-LMUL1-NEXT:    [[TMP15:%.*]] = fadd <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
-; CHECK-LMUL1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[RESULT:%.*]], i64 [[TMP0]]
-; CHECK-LMUL1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[TMP1]]
-; CHECK-LMUL1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-LMUL1-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <4 x float>*
-; CHECK-LMUL1-NEXT:    store <4 x float> [[TMP14]], <4 x float>* [[TMP19]], align 4
-; CHECK-LMUL1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 4
-; CHECK-LMUL1-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP20]] to <4 x float>*
-; CHECK-LMUL1-NEXT:    store <4 x float> [[TMP15]], <4 x float>* [[TMP21]], align 4
-; CHECK-LMUL1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-LMUL1-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-LMUL1-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-LMUL1:       middle.block:
-; CHECK-LMUL1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[CONV]], [[N_VEC]]
-; CHECK-LMUL1-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-LMUL1:       scalar.ph:
-; CHECK-LMUL1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-LMUL1-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-LMUL1:       for.cond.cleanup.loopexit:
-; CHECK-LMUL1-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK-LMUL1:       for.cond.cleanup:
-; CHECK-LMUL1-NEXT:    ret void
-; CHECK-LMUL1:       for.body:
-; CHECK-LMUL1-NEXT:    [[I_011:%.*]] = phi i64 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-LMUL1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[I_011]]
-; CHECK-LMUL1-NEXT:    [[TMP23:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-LMUL1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[I_011]]
-; CHECK-LMUL1-NEXT:    [[TMP24:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-LMUL1-NEXT:    [[ADD:%.*]] = fadd float [[TMP23]], [[TMP24]]
-; CHECK-LMUL1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[I_011]]
-; CHECK-LMUL1-NEXT:    store float [[ADD]], float* [[ARRAYIDX3]], align 4
-; CHECK-LMUL1-NEXT:    [[ADD4]] = add nuw nsw i64 [[I_011]], 1
-; CHECK-LMUL1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[ADD4]], [[CONV]]
-; CHECK-LMUL1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
-; CHECK-LMUL2-LABEL: @add(
-; CHECK-LMUL2-NEXT:  entry:
-; CHECK-LMUL2-NEXT:    [[CONV:%.*]] = zext i32 [[SIZE:%.*]] to i64
-; CHECK-LMUL2-NEXT:    [[CMP10_NOT:%.*]] = icmp eq i32 [[SIZE]], 0
-; CHECK-LMUL2-NEXT:    br i1 [[CMP10_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK-LMUL2:       for.body.preheader:
-; CHECK-LMUL2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CONV]], 16
-; CHECK-LMUL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-LMUL2:       vector.ph:
-; CHECK-LMUL2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[CONV]], 16
-; CHECK-LMUL2-NEXT:    [[N_VEC:%.*]] = sub i64 [[CONV]], [[N_MOD_VF]]
-; CHECK-LMUL2-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-LMUL2:       vector.body:
-; CHECK-LMUL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-LMUL2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-LMUL2-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-LMUL2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[SRC1:%.*]], i64 [[TMP0]]
-; CHECK-LMUL2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP1]]
-; CHECK-LMUL2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-LMUL2-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-LMUL2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP5]], align 4
-; CHECK-LMUL2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 8
-; CHECK-LMUL2-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
-; CHECK-LMUL2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4
-; CHECK-LMUL2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[SRC2:%.*]], i64 [[TMP0]]
-; CHECK-LMUL2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP1]]
-; CHECK-LMUL2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-LMUL2-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-LMUL2-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-LMUL2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 8
-; CHECK-LMUL2-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-LMUL2-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-LMUL2-NEXT:    [[TMP14:%.*]] = fadd <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
-; CHECK-LMUL2-NEXT:    [[TMP15:%.*]] = fadd <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
-; CHECK-LMUL2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[RESULT:%.*]], i64 [[TMP0]]
-; CHECK-LMUL2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[TMP1]]
-; CHECK-LMUL2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-LMUL2-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>*
-; CHECK-LMUL2-NEXT:    store <8 x float> [[TMP14]], <8 x float>* [[TMP19]], align 4
-; CHECK-LMUL2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 8
-; CHECK-LMUL2-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
-; CHECK-LMUL2-NEXT:    store <8 x float> [[TMP15]], <8 x float>* [[TMP21]], align 4
-; CHECK-LMUL2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-LMUL2-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-LMUL2-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-LMUL2:       middle.block:
-; CHECK-LMUL2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[CONV]], [[N_VEC]]
-; CHECK-LMUL2-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-LMUL2:       scalar.ph:
-; CHECK-LMUL2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-LMUL2-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-LMUL2:       for.cond.cleanup.loopexit:
-; CHECK-LMUL2-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK-LMUL2:       for.cond.cleanup:
-; CHECK-LMUL2-NEXT:    ret void
-; CHECK-LMUL2:       for.body:
-; CHECK-LMUL2-NEXT:    [[I_011:%.*]] = phi i64 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-LMUL2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[I_011]]
-; CHECK-LMUL2-NEXT:    [[TMP23:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-LMUL2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[I_011]]
-; CHECK-LMUL2-NEXT:    [[TMP24:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-LMUL2-NEXT:    [[ADD:%.*]] = fadd float [[TMP23]], [[TMP24]]
-; CHECK-LMUL2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[I_011]]
-; CHECK-LMUL2-NEXT:    store float [[ADD]], float* [[ARRAYIDX3]], align 4
-; CHECK-LMUL2-NEXT:    [[ADD4]] = add nuw nsw i64 [[I_011]], 1
-; CHECK-LMUL2-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[ADD4]], [[CONV]]
-; CHECK-LMUL2-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
-; CHECK-LMUL4-LABEL: @add(
-; CHECK-LMUL4-NEXT:  entry:
-; CHECK-LMUL4-NEXT:    [[CONV:%.*]] = zext i32 [[SIZE:%.*]] to i64
-; CHECK-LMUL4-NEXT:    [[CMP10_NOT:%.*]] = icmp eq i32 [[SIZE]], 0
-; CHECK-LMUL4-NEXT:    br i1 [[CMP10_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[ITER_CHECK:%.*]]
-; CHECK-LMUL4:       iter.check:
-; CHECK-LMUL4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CONV]], 8
-; CHECK-LMUL4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-LMUL4:       vector.main.loop.iter.check:
-; CHECK-LMUL4-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[CONV]], 32
-; CHECK-LMUL4-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-LMUL4:       vector.ph:
-; CHECK-LMUL4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[CONV]], 32
-; CHECK-LMUL4-NEXT:    [[N_VEC:%.*]] = sub i64 [[CONV]], [[N_MOD_VF]]
-; CHECK-LMUL4-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-LMUL4:       vector.body:
-; CHECK-LMUL4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-LMUL4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-LMUL4-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 16
-; CHECK-LMUL4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[SRC1:%.*]], i64 [[TMP0]]
-; CHECK-LMUL4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP1]]
-; CHECK-LMUL4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-LMUL4-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <16 x float>*
-; CHECK-LMUL4-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x float>, <16 x float>* [[TMP5]], align 4
-; CHECK-LMUL4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 16
-; CHECK-LMUL4-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <16 x float>*
-; CHECK-LMUL4-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x float>, <16 x float>* [[TMP7]], align 4
-; CHECK-LMUL4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[SRC2:%.*]], i64 [[TMP0]]
-; CHECK-LMUL4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP1]]
-; CHECK-LMUL4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-LMUL4-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <16 x float>*
-; CHECK-LMUL4-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x float>, <16 x float>* [[TMP11]], align 4
-; CHECK-LMUL4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 16
-; CHECK-LMUL4-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <16 x float>*
-; CHECK-LMUL4-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x float>, <16 x float>* [[TMP13]], align 4
-; CHECK-LMUL4-NEXT:    [[TMP14:%.*]] = fadd <16 x float> [[WIDE_LOAD]], [[WIDE_LOAD3]]
-; CHECK-LMUL4-NEXT:    [[TMP15:%.*]] = fadd <16 x float> [[WIDE_LOAD2]], [[WIDE_LOAD4]]
-; CHECK-LMUL4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[RESULT:%.*]], i64 [[TMP0]]
-; CHECK-LMUL4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[TMP1]]
-; CHECK-LMUL4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-LMUL4-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
-; CHECK-LMUL4-NEXT:    store <16 x float> [[TMP14]], <16 x float>* [[TMP19]], align 4
-; CHECK-LMUL4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 16
-; CHECK-LMUL4-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP20]] to <16 x float>*
-; CHECK-LMUL4-NEXT:    store <16 x float> [[TMP15]], <16 x float>* [[TMP21]], align 4
-; CHECK-LMUL4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-LMUL4-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-LMUL4-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-LMUL4:       middle.block:
-; CHECK-LMUL4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[CONV]], [[N_VEC]]
-; CHECK-LMUL4-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-LMUL4:       vec.epilog.iter.check:
-; CHECK-LMUL4-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[CONV]], [[N_VEC]]
-; CHECK-LMUL4-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
-; CHECK-LMUL4-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-LMUL4:       vec.epilog.ph:
-; CHECK-LMUL4-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-LMUL4-NEXT:    [[N_MOD_VF5:%.*]] = urem i64 [[CONV]], 8
-; CHECK-LMUL4-NEXT:    [[N_VEC6:%.*]] = sub i64 [[CONV]], [[N_MOD_VF5]]
-; CHECK-LMUL4-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK-LMUL4:       vec.epilog.vector.body:
-; CHECK-LMUL4-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-LMUL4-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-LMUL4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP23]]
-; CHECK-LMUL4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0
-; CHECK-LMUL4-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to <8 x float>*
-; CHECK-LMUL4-NEXT:    [[WIDE_LOAD9:%.*]] = load <8 x float>, <8 x float>* [[TMP26]], align 4
-; CHECK-LMUL4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP23]]
-; CHECK-LMUL4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP27]], i32 0
-; CHECK-LMUL4-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>*
-; CHECK-LMUL4-NEXT:    [[WIDE_LOAD10:%.*]] = load <8 x float>, <8 x float>* [[TMP29]], align 4
-; CHECK-LMUL4-NEXT:    [[TMP30:%.*]] = fadd <8 x float> [[WIDE_LOAD9]], [[WIDE_LOAD10]]
-; CHECK-LMUL4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[TMP23]]
-; CHECK-LMUL4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP31]], i32 0
-; CHECK-LMUL4-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>*
-; CHECK-LMUL4-NEXT:    store <8 x float> [[TMP30]], <8 x float>* [[TMP33]], align 4
-; CHECK-LMUL4-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-LMUL4-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]]
-; CHECK-LMUL4-NEXT:    br i1 [[TMP34]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-LMUL4:       vec.epilog.middle.block:
-; CHECK-LMUL4-NEXT:    [[CMP_N7:%.*]] = icmp eq i64 [[CONV]], [[N_VEC6]]
-; CHECK-LMUL4-NEXT:    br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK-LMUL4:       vec.epilog.scalar.ph:
-; CHECK-LMUL4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
-; CHECK-LMUL4-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-LMUL4:       for.cond.cleanup.loopexit.loopexit:
-; CHECK-LMUL4-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT]]
-; CHECK-LMUL4:       for.cond.cleanup.loopexit:
-; CHECK-LMUL4-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK-LMUL4:       for.cond.cleanup:
-; CHECK-LMUL4-NEXT:    ret void
-; CHECK-LMUL4:       for.body:
-; CHECK-LMUL4-NEXT:    [[I_011:%.*]] = phi i64 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-LMUL4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[I_011]]
-; CHECK-LMUL4-NEXT:    [[TMP35:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-LMUL4-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[I_011]]
-; CHECK-LMUL4-NEXT:    [[TMP36:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-LMUL4-NEXT:    [[ADD:%.*]] = fadd float [[TMP35]], [[TMP36]]
-; CHECK-LMUL4-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[I_011]]
-; CHECK-LMUL4-NEXT:    store float [[ADD]], float* [[ARRAYIDX3]], align 4
-; CHECK-LMUL4-NEXT:    [[ADD4]] = add nuw nsw i64 [[I_011]], 1
-; CHECK-LMUL4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[ADD4]], [[CONV]]
-; CHECK-LMUL4-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-;
-; CHECK-LMUL8-LABEL: @add(
-; CHECK-LMUL8-NEXT:  entry:
-; CHECK-LMUL8-NEXT:    [[CONV:%.*]] = zext i32 [[SIZE:%.*]] to i64
-; CHECK-LMUL8-NEXT:    [[CMP10_NOT:%.*]] = icmp eq i32 [[SIZE]], 0
-; CHECK-LMUL8-NEXT:    br i1 [[CMP10_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[ITER_CHECK:%.*]]
-; CHECK-LMUL8:       iter.check:
-; CHECK-LMUL8-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CONV]], 16
-; CHECK-LMUL8-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-LMUL8:       vector.main.loop.iter.check:
-; CHECK-LMUL8-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[CONV]], 32
-; CHECK-LMUL8-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-LMUL8:       vector.ph:
-; CHECK-LMUL8-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[CONV]], 32
-; CHECK-LMUL8-NEXT:    [[N_VEC:%.*]] = sub i64 [[CONV]], [[N_MOD_VF]]
-; CHECK-LMUL8-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-LMUL8:       vector.body:
-; CHECK-LMUL8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-LMUL8-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-LMUL8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[SRC1:%.*]], i64 [[TMP0]]
-; CHECK-LMUL8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-LMUL8-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <32 x float>*
-; CHECK-LMUL8-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x float>, <32 x float>* [[TMP3]], align 4
-; CHECK-LMUL8-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[SRC2:%.*]], i64 [[TMP0]]
-; CHECK-LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-LMUL8-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <32 x float>*
-; CHECK-LMUL8-NEXT:    [[WIDE_LOAD2:%.*]] = load <32 x float>, <32 x float>* [[TMP6]], align 4
-; CHECK-LMUL8-NEXT:    [[TMP7:%.*]] = fadd <32 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
-; CHECK-LMUL8-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[RESULT:%.*]], i64 [[TMP0]]
-; CHECK-LMUL8-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-LMUL8-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <32 x float>*
-; CHECK-LMUL8-NEXT:    store <32 x float> [[TMP7]], <32 x float>* [[TMP10]], align 4
-; CHECK-LMUL8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-LMUL8-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-LMUL8-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-LMUL8:       middle.block:
-; CHECK-LMUL8-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[CONV]], [[N_VEC]]
-; CHECK-LMUL8-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-LMUL8:       vec.epilog.iter.check:
-; CHECK-LMUL8-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[CONV]], [[N_VEC]]
-; CHECK-LMUL8-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 16
-; CHECK-LMUL8-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-LMUL8:       vec.epilog.ph:
-; CHECK-LMUL8-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-LMUL8-NEXT:    [[N_MOD_VF3:%.*]] = urem i64 [[CONV]], 16
-; CHECK-LMUL8-NEXT:    [[N_VEC4:%.*]] = sub i64 [[CONV]], [[N_MOD_VF3]]
-; CHECK-LMUL8-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK-LMUL8:       vec.epilog.vector.body:
-; CHECK-LMUL8-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-LMUL8-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-LMUL8-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP12]]
-; CHECK-LMUL8-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0
-; CHECK-LMUL8-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <16 x float>*
-; CHECK-LMUL8-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x float>, <16 x float>* [[TMP15]], align 4
-; CHECK-LMUL8-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP12]]
-; CHECK-LMUL8-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-LMUL8-NEXT:    [[TMP18:%.*]] = bitcast float* [[TMP17]] to <16 x float>*
-; CHECK-LMUL8-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x float>, <16 x float>* [[TMP18]], align 4
-; CHECK-LMUL8-NEXT:    [[TMP19:%.*]] = fadd <16 x float> [[WIDE_LOAD7]], [[WIDE_LOAD8]]
-; CHECK-LMUL8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[TMP12]]
-; CHECK-LMUL8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-LMUL8-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP21]] to <16 x float>*
-; CHECK-LMUL8-NEXT:    store <16 x float> [[TMP19]], <16 x float>* [[TMP22]], align 4
-; CHECK-LMUL8-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 16
-; CHECK-LMUL8-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
-; CHECK-LMUL8-NEXT:    br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-LMUL8:       vec.epilog.middle.block:
-; CHECK-LMUL8-NEXT:    [[CMP_N5:%.*]] = icmp eq i64 [[CONV]], [[N_VEC4]]
-; CHECK-LMUL8-NEXT:    br i1 [[CMP_N5]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK-LMUL8:       vec.epilog.scalar.ph:
-; CHECK-LMUL8-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
-; CHECK-LMUL8-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-LMUL8:       for.cond.cleanup.loopexit.loopexit:
-; CHECK-LMUL8-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT]]
-; CHECK-LMUL8:       for.cond.cleanup.loopexit:
-; CHECK-LMUL8-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK-LMUL8:       for.cond.cleanup:
-; CHECK-LMUL8-NEXT:    ret void
-; CHECK-LMUL8:       for.body:
-; CHECK-LMUL8-NEXT:    [[I_011:%.*]] = phi i64 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[I_011]]
-; CHECK-LMUL8-NEXT:    [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[I_011]]
-; CHECK-LMUL8-NEXT:    [[TMP25:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-LMUL8-NEXT:    [[ADD:%.*]] = fadd float [[TMP24]], [[TMP25]]
-; CHECK-LMUL8-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[I_011]]
-; CHECK-LMUL8-NEXT:    store float [[ADD]], float* [[ARRAYIDX3]], align 4
-; CHECK-LMUL8-NEXT:    [[ADD4]] = add nuw nsw i64 [[I_011]], 1
-; CHECK-LMUL8-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[ADD4]], [[CONV]]
-; CHECK-LMUL8-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-;
+; CHECK-LMUL1:      LV(REG): Found max usage: 2 item
+; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers
+; CHECK-LMUL1-NEXT: LV(REG): Found invariant usage: 1 item
+; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers
+; CHECK-LMUL2:      LV(REG): Found max usage: 2 item
+; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 4 registers
+; CHECK-LMUL2-NEXT: LV(REG): Found invariant usage: 1 item
+; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 4 registers
+; CHECK-LMUL4:      LV(REG): Found max usage: 2 item
+; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 8 registers
+; CHECK-LMUL4-NEXT: LV(REG): Found invariant usage: 1 item
+; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 8 registers
+; CHECK-LMUL8:      LV(REG): Found max usage: 2 item
+; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 16 registers
+; CHECK-LMUL8-NEXT: LV(REG): Found invariant usage: 1 item
+; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 16 registers
 
 entry:
   %conv = zext i32 %size to i64

diff  --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
index 27807c4c612e6..2a529cc653d6f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -scalable-vectorization=on \
 ; RUN:   -riscv-v-vector-bits-min=128 -riscv-v-vector-bits-max=128 \
 ; RUN:   -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize \
@@ -12,67 +11,15 @@
 
 ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
 define i32 @add(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @add(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18]] = add <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19]] = add <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[ADD_LCSSA]]
-;
+; CHECK-LABEL: @add
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[ADD1:.*]] = add <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ADD:.*]] = add <vscale x 8 x i32> %[[ADD2]], %[[ADD1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> %[[ADD]])
 entry:
   br label %for.body
 
@@ -94,67 +41,15 @@ for.end:                                 ; preds = %for.body, %entry
 
 ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
 define i32 @or(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @or(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18]] = or <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19]] = or <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[OR]] = or i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[OR_LCSSA]]
-;
+; CHECK-LABEL: @or
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[OR1:.*]] = or <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[OR2:.*]] = or <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[OR:.*]] = or <vscale x 8 x i32> %[[OR2]], %[[OR1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> %[[OR]])
 entry:
   br label %for.body
 
@@ -176,67 +71,15 @@ for.end:                                 ; preds = %for.body, %entry
 
 ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
 define i32 @and(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @and(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 -1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 -1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18]] = and <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19]] = and <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = and <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[AND]] = and i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[AND_LCSSA]]
-;
+; CHECK-LABEL: @and
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[AND1:.*]] = and <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[AND2:.*]] = and <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[AND:.*]] = and <vscale x 8 x i32> %[[AND2]], %[[AND1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> %[[AND]])
 entry:
   br label %for.body
 
@@ -258,67 +101,15 @@ for.end:                                 ; preds = %for.body, %entry
 
 ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
 define i32 @xor(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @xor(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18]] = xor <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19]] = xor <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = xor <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[XOR]] = xor i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[XOR_LCSSA]]
-;
+; CHECK-LABEL: @xor
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[XOR1:.*]] = xor <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[XOR2:.*]] = xor <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[XOR:.*]] = xor <vscale x 8 x i32> %[[XOR2]], %[[XOR1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> %[[XOR]])
 entry:
   br label %for.body
 
@@ -340,71 +131,18 @@ for.end:                                 ; preds = %for.body, %entry
 ; SMIN
 
 define i32 @smin(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @smin(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp slt <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp slt <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x i32> [[WIDE_LOAD]], <vscale x 8 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i32> [[WIDE_LOAD2]], <vscale x 8 x i32> [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp slt <vscale x 8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x i32> [[TMP20]], <vscale x 8 x i32> [[TMP21]]
-; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[TMP26]], [[SUM_010]]
-; CHECK-NEXT:    [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP26]], i32 [[SUM_010]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @smin
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[ICMP1:.*]] = icmp slt <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[ICMP2:.*]] = icmp slt <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ICMP:.*]] = icmp slt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32>  %[[SEL]])
 entry:
   br label %for.body
 
@@ -427,71 +165,18 @@ for.end:
 ; UMAX
 
 define i32 @umax(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @umax(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ugt <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp ugt <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x i32> [[WIDE_LOAD]], <vscale x 8 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i32> [[WIDE_LOAD2]], <vscale x 8 x i32> [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ugt <vscale x 8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x i32> [[TMP20]], <vscale x 8 x i32> [[TMP21]]
-; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP_I:%.*]] = icmp ugt i32 [[TMP26]], [[SUM_010]]
-; CHECK-NEXT:    [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP26]], i32 [[SUM_010]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @umax
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[ICMP1:.*]] = icmp ugt <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[ICMP2:.*]] = icmp ugt <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ICMP:.*]] = icmp ugt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32>  %[[SEL]])
 entry:
   br label %for.body
 
@@ -514,67 +199,15 @@ for.end:
 ; FADD (FAST)
 
 define float @fadd_fast(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-LABEL: @fadd_fast(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> zeroinitializer, float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18]] = fadd fast <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19]] = fadd fast <vscale x 8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <vscale x 8 x float> [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = fadd fast float [[TMP24]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret float [[ADD_LCSSA]]
-;
+; CHECK-LABEL: @fadd_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
+; CHECK: %[[ADD1:.*]] = fadd fast <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = fadd fast <vscale x 8 x float> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ADD:.*]] = fadd fast <vscale x 8 x float> %[[ADD2]], %[[ADD1]]
+; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[ADD]])
 entry:
   br label %for.body
 
@@ -595,55 +228,15 @@ for.end:
 ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
 ; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
 define bfloat @fadd_fast_bfloat(bfloat* noalias nocapture readonly %a, i64 %n) {
-; CHECK-LABEL: @fadd_fast_bfloat(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x bfloat> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x bfloat> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds bfloat, bfloat* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds bfloat, bfloat* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds bfloat, bfloat* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast bfloat* [[TMP4]] to <8 x bfloat>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds bfloat, bfloat* [[TMP2]], i32 8
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast bfloat* [[TMP6]] to <8 x bfloat>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8]] = fadd fast <8 x bfloat> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP9]] = fadd fast <8 x bfloat> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x bfloat> [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi bfloat [ 0xR0000, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi bfloat [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds bfloat, bfloat* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load bfloat, bfloat* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = fadd fast bfloat [[TMP12]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi bfloat [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret bfloat [[ADD_LCSSA]]
-;
+; CHECK-LABEL: @fadd_fast_bfloat
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <8 x bfloat>
+; CHECK: %[[LOAD2:.*]] = load <8 x bfloat>
+; CHECK: %[[FADD1:.*]] = fadd fast <8 x bfloat> %[[LOAD1]]
+; CHECK: %[[FADD2:.*]] = fadd fast <8 x bfloat> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = fadd fast <8 x bfloat> %[[FADD2]], %[[FADD1]]
+; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[RDX]])
 entry:
   br label %for.body
 
@@ -665,71 +258,18 @@ for.end:
 
 ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
 define float @fmin_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
-; CHECK-LABEL: @fmin_fast(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = fcmp olt <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19:%.*]] = fcmp olt <vscale x 8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp olt <vscale x 8 x float> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x float> [[TMP20]], <vscale x 8 x float> [[TMP21]]
-; CHECK-NEXT:    [[TMP25:%.*]] = call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp olt float [[TMP26]], [[SUM_07]]
-; CHECK-NEXT:    [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP26]], float [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret float [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @fmin_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
+; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x float> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x float> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x float> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x float> %[[SEL1]], <vscale x 8 x float> %[[SEL2]]
+; CHECK-NEXT: call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> %[[SEL]])
 entry:
   br label %for.body
 
@@ -752,71 +292,18 @@ for.end:
 
 ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
 define float @fmax_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
-; CHECK-LABEL: @fmax_fast(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast ogt <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19:%.*]] = fcmp fast ogt <vscale x 8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <vscale x 8 x float> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select fast <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x float> [[TMP20]], <vscale x 8 x float> [[TMP21]]
-; CHECK-NEXT:    [[TMP25:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp fast ogt float [[TMP26]], [[SUM_07]]
-; CHECK-NEXT:    [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP26]], float [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret float [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @fmax_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
+; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x float> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x float> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x float> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x float> %[[SEL1]], <vscale x 8 x float> %[[SEL2]]
+; CHECK-NEXT: call fast float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> %[[SEL]])
 entry:
   br label %for.body
 
@@ -842,55 +329,15 @@ for.end:
 ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
 ; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
 define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @mul(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8]] = mul <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP9]] = mul <4 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = mul <4 x i32> [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL]] = mul nsw i32 [[TMP12]], [[SUM_07]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[MUL_LCSSA]]
-;
+; CHECK-LABEL: @mul
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <4 x i32>
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]]
+; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
 entry:
   br label %for.body
 
@@ -912,81 +359,19 @@ for.end:                                 ; preds = %for.body, %entry
 ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
 ; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
 define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @memory_dependence(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 4
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD2]]
-; CHECK-NEXT:    [[TMP16:%.*]] = add nuw nsw i64 [[TMP0]], 32
-; CHECK-NEXT:    [[TMP17:%.*]] = add nuw nsw i64 [[TMP1]], 32
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP21]], align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 4
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP23]], align 4
-; CHECK-NEXT:    [[TMP24]] = mul <4 x i32> [[WIDE_LOAD3]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP25]] = mul <4 x i32> [[WIDE_LOAD4]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = mul <4 x i32> [[TMP25]], [[TMP24]]
-; CHECK-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ [[MUL:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
-; CHECK-NEXT:    [[TMP28:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP29]], [[TMP28]]
-; CHECK-NEXT:    [[ADD2:%.*]] = add nuw nsw i64 [[I]], 32
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[ADD2]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[MUL]] = mul nsw i32 [[TMP29]], [[SUM]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[MUL_LCSSA]]
-;
+; CHECK-LABEL: @memory_dependence
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <4 x i32>
+; CHECK: %[[LOAD3:.*]] = load <4 x i32>
+; CHECK: %[[LOAD4:.*]] = load <4 x i32>
+; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
+; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
index 577c18c2740bd..8977322610d1a 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
@@ -61,7 +61,7 @@ define void @func_21() {
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0
@@ -79,7 +79,7 @@ define void @func_21() {
 ; CHECK-NEXT:    store i32 [[SCALAR_RECUR]], i32* [[B_PTR]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 5
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
index ad28b9bda252d..6dd461de644b8 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
@@ -1,51 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 < %s | FileCheck %s
 ; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 < %s | FileCheck %s
 
 define i32 @main(i32 %arg, i8** nocapture readnone %arg1) #0 {
-; CHECK-LABEL: @main(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = alloca i8, align 1
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[OFFSET_IDX]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = add i8 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = add i8 [[OFFSET_IDX]], 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = add i8 [[OFFSET_IDX]], 5
-; CHECK-NEXT:    [[TMP7:%.*]] = add i8 [[OFFSET_IDX]], 6
-; CHECK-NEXT:    [[TMP8:%.*]] = add i8 [[OFFSET_IDX]], 7
-; CHECK-NEXT:    store i8 [[TMP1]], i8* [[TMP0]], align 2
-; CHECK-NEXT:    store i8 [[TMP2]], i8* [[TMP0]], align 2
-; CHECK-NEXT:    store i8 [[TMP3]], i8* [[TMP0]], align 2
-; CHECK-NEXT:    store i8 [[TMP4]], i8* [[TMP0]], align 2
-; CHECK-NEXT:    store i8 [[TMP5]], i8* [[TMP0]], align 2
-; CHECK-NEXT:    store i8 [[TMP6]], i8* [[TMP0]], align 2
-; CHECK-NEXT:    store i8 [[TMP7]], i8* [[TMP0]], align 2
-; CHECK-NEXT:    store i8 [[TMP8]], i8* [[TMP0]], align 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 9, 8
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[RET:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[STOREMERGE_I_I:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP12_I_I:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    store i8 [[STOREMERGE_I_I]], i8* [[TMP0]], align 2
-; CHECK-NEXT:    [[TMP8_I_I:%.*]] = icmp ult i8 [[STOREMERGE_I_I]], 8
-; CHECK-NEXT:    [[TMP12_I_I]] = add nuw nsw i8 [[STOREMERGE_I_I]], 1
-; CHECK-NEXT:    br i1 [[TMP8_I_I]], label [[LOOP]], label [[RET]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       ret:
-; CHECK-NEXT:    ret i32 0
-;
+;CHECK: vector.body:
 entry:
   %0 = alloca i8, align 1
   br label %loop

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll b/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll
index 2569d4abaddb6..8281bf1803533 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -disable-loop-unrolling -debug-only=loop-vectorize -passes='default<O3>' -S 2>&1 | FileCheck %s
 ; RUN: opt < %s -disable-loop-unrolling -debug-only=loop-vectorize -O3 -S 2>&1 | FileCheck %s
 ; REQUIRES: asserts
@@ -13,45 +12,14 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: nounwind readonly uwtable
 define i32 @vect() {
-; CHECK-LABEL: @vect(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [255 x i32], [255 x i32]* @a, i64 0, i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4]] = add <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP5]] = add <4 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 248
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 248, [[MIDDLE_BLOCK]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[RED_05:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [255 x i32], [255 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP8]], [[RED_05]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 255
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 [[ADD]]
-;
+; CHECK: LV: Checking a loop in 'vect'
 entry:
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %entry
 ; We need to make sure we did vectorize the loop
+; CHECK: LV: Found a loop: for.body
+; CHECK: LV: We can vectorize this loop!
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %red.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
   %arrayidx = getelementptr inbounds [255 x i32], [255 x i32]* @a, i64 0, i64 %indvars.iv
@@ -62,10 +30,18 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %exitcond, label %for.end, label %for.body
 
 ; If it did, we have two loops:
+; CHECK: vector.body:
+; CHECK: br {{.*}} label %vector.body, !llvm.loop [[vect:![0-9]+]]
+; CHECK: for.body:
+; CHECK: br {{.*}} label %for.body{{.*}}, !llvm.loop [[scalar:![0-9]+]]
 
 for.end:                                          ; preds = %for.body
   ret i32 %add
 }
 
 ; Now, we check for the Hint metadata
+; CHECK: [[vect]] = distinct !{[[vect]], [[width:![0-9]+]]}
+; CHECK: [[width]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[scalar]] = distinct !{[[scalar]], [[runtime_unroll:![0-9]+]], [[width]]}
+; CHECK: [[runtime_unroll]] = !{!"llvm.loop.unroll.runtime.disable"}
 

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 44b219234e24c..90377a0500a66 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -27,7 +27,7 @@ define void @f1() {
 ; CHECK-NEXT:    store <2 x i16*> <i16* getelementptr inbounds ([1 x %rec8], [1 x %rec8]* @a, i32 0, i32 0, i32 0), i16* getelementptr inbounds ([1 x %rec8], [1 x %rec8]* @a, i32 0, i32 0, i32 0)>, <2 x i16*>* [[TMP4]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 2, 2
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]]
@@ -44,7 +44,7 @@ define void @f1() {
 ; CHECK-NEXT:    store i16* [[_TMP4]], i16** [[_TMP7]], align 8
 ; CHECK-NEXT:    [[_TMP9]] = add nsw i16 [[C_1_0]], 1
 ; CHECK-NEXT:    [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
-; CHECK-NEXT:    br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[_TMP11]], label [[BB2]], label [[BB3]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 721209e22334f..492a8d8242c69 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -21,15 +21,15 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <32 x i8> [ <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <32 x i8> [ <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <32 x i8>*
-; CHECK-NEXT:    store <32 x i8> [[VEC_IND]], <32 x i8>* [[TMP8]], align 1
+; CHECK-NEXT:    store <32 x i8> [[VEC_IND1]], <32 x i8>* [[TMP8]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <32 x i8> [[VEC_IND]], <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>
+; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <32 x i8> [[VEC_IND1]], <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
@@ -92,37 +92,37 @@ define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) noun
 ; CHECK-NEXT:    [[STEP_ADD2:%.*]] = add <2 x i64> [[STEP_ADD1]], <i64 2, i64 2>
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 9, [[INDEX]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 6
-; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <2 x i64> [[VEC_IND]], <i64 3, i64 3>
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i64> [[STEP_ADD]], <i64 3, i64 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[STEP_ADD1]], <i64 3, i64 3>
-; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[STEP_ADD2]], <i64 3, i64 3>
-; CHECK-NEXT:    [[TMP13:%.*]] = sitofp <2 x i64> [[TMP9]] to <2 x float>
-; CHECK-NEXT:    [[TMP14:%.*]] = sitofp <2 x i64> [[TMP10]] to <2 x float>
-; CHECK-NEXT:    [[TMP15:%.*]] = sitofp <2 x i64> [[TMP11]] to <2 x float>
-; CHECK-NEXT:    [[TMP16:%.*]] = sitofp <2 x i64> [[TMP12]] to <2 x float>
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP21]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP13]], <2 x float>* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 2
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast float* [[TMP23]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP14]], <2 x float>* [[TMP24]], align 4
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <2 x i64> [[VEC_IND]], <i64 3, i64 3>
+; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <2 x i64> [[STEP_ADD]], <i64 3, i64 3>
+; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <2 x i64> [[STEP_ADD1]], <i64 3, i64 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = add nsw <2 x i64> [[STEP_ADD2]], <i64 3, i64 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = sitofp <2 x i64> [[TMP13]] to <2 x float>
+; CHECK-NEXT:    [[TMP18:%.*]] = sitofp <2 x i64> [[TMP14]] to <2 x float>
+; CHECK-NEXT:    [[TMP19:%.*]] = sitofp <2 x i64> [[TMP15]] to <2 x float>
+; CHECK-NEXT:    [[TMP20:%.*]] = sitofp <2 x i64> [[TMP16]] to <2 x float>
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 0
 ; CHECK-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP15]], <2 x float>* [[TMP26]], align 4
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 6
+; CHECK-NEXT:    store <2 x float> [[TMP17]], <2 x float>* [[TMP26]], align 4
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 2
 ; CHECK-NEXT:    [[TMP28:%.*]] = bitcast float* [[TMP27]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP16]], <2 x float>* [[TMP28]], align 4
+; CHECK-NEXT:    store <2 x float> [[TMP18]], <2 x float>* [[TMP28]], align 4
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 4
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast float* [[TMP29]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP19]], <2 x float>* [[TMP30]], align 4
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 6
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast float* [[TMP31]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP20]], <2 x float>* [[TMP32]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD2]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
index 213d158ab8c75..779a482c582a6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
@@ -29,58 +29,58 @@ define void @cff_index_load_offsets(i1 %cond, i8 %x, i8* %p) #0 {
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, i8* null, i64 [[TMP3]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[X:%.*]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT2]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT2]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw <4 x i32> [[TMP4]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw <4 x i32> [[TMP5]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[P:%.*]], align 1, !tbaa [[TBAA1:![0-9]+]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP8]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT3]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i8> poison, i8 [[TMP9]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT5]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT6]] to <4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw <4 x i32> [[TMP10]], <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:    [[TMP13:%.*]] = shl nuw nsw <4 x i32> [[TMP11]], <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP12]], [[TMP6]]
-; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP13]], [[TMP7]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i8> poison, i8 [[TMP16]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT7]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i8> poison, i8 [[TMP17]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT9]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT8]] to <4 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT10]] to <4 x i32>
-; CHECK-NEXT:    [[TMP22:%.*]] = or <4 x i32> [[TMP18]], [[TMP20]]
-; CHECK-NEXT:    [[TMP23:%.*]] = or <4 x i32> [[TMP19]], [[TMP21]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP22]], i32 0
-; CHECK-NEXT:    store i32 [[TMP24]], i32* undef, align 4, !tbaa [[TBAA4:![0-9]+]]
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP22]], i32 1
-; CHECK-NEXT:    store i32 [[TMP25]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP22]], i32 2
-; CHECK-NEXT:    store i32 [[TMP26]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3
-; CHECK-NEXT:    store i32 [[TMP27]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP23]], i32 0
-; CHECK-NEXT:    store i32 [[TMP28]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP23]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = shl nuw <4 x i32> [[TMP8]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw <4 x i32> [[TMP9]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, i8* [[P:%.*]], align 1, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT4]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i8> poison, i8 [[TMP13]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT6]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT5]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw nsw <4 x i32> [[TMP14]], <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP17:%.*]] = shl nuw nsw <4 x i32> [[TMP15]], <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i32> [[TMP16]], [[TMP10]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or <4 x i32> [[TMP17]], [[TMP11]]
+; CHECK-NEXT:    [[TMP20:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[TMP20]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i8> poison, i8 [[TMP21]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT10]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = or <4 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = or <4 x i32> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT9]] to <4 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT11]] to <4 x i32>
+; CHECK-NEXT:    [[TMP26:%.*]] = or <4 x i32> [[TMP22]], [[TMP24]]
+; CHECK-NEXT:    [[TMP27:%.*]] = or <4 x i32> [[TMP23]], [[TMP25]]
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP26]], i32 0
+; CHECK-NEXT:    store i32 [[TMP28]], i32* undef, align 4, !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP26]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP29]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP23]], i32 2
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP26]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP30]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[TMP26]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP31]], i32* undef, align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP27]], i32 0
+; CHECK-NEXT:    store i32 [[TMP32]], i32* undef, align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[TMP27]], i32 1
+; CHECK-NEXT:    store i32 [[TMP33]], i32* undef, align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[TMP27]], i32 2
+; CHECK-NEXT:    store i32 [[TMP34]], i32* undef, align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x i32> [[TMP27]], i32 3
+; CHECK-NEXT:    store i32 [[TMP35]], i32* undef, align 4, !tbaa [[TBAA4]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[SW_EPILOG:%.*]], label [[SCALAR_PH]]
@@ -91,14 +91,14 @@ define void @cff_index_load_offsets(i1 %cond, i8 %x, i8* %p) #0 {
 ; CHECK-NEXT:    [[P_359:%.*]] = phi i8* [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[CONV70:%.*]] = zext i8 [[X]] to i32
 ; CHECK-NEXT:    [[SHL71:%.*]] = shl nuw i32 [[CONV70]], 24
-; CHECK-NEXT:    [[TMP33:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[CONV73:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[CONV73:%.*]] = zext i8 [[TMP37]] to i32
 ; CHECK-NEXT:    [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16
 ; CHECK-NEXT:    [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[SHL78:%.*]] = shl nuw nsw i32 undef, 8
 ; CHECK-NEXT:    [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]]
-; CHECK-NEXT:    [[CONV81:%.*]] = zext i8 [[TMP34]] to i32
+; CHECK-NEXT:    [[CONV81:%.*]] = zext i8 [[TMP38]] to i32
 ; CHECK-NEXT:    [[OR83:%.*]] = or i32 [[OR79]], [[CONV81]]
 ; CHECK-NEXT:    store i32 [[OR83]], i32* undef, align 4, !tbaa [[TBAA4]]
 ; CHECK-NEXT:    [[ADD_PTR86]] = getelementptr inbounds i8, i8* [[P_359]], i64 4

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
index fbb59ceb5b990..5023d8f1ff4b5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s
 
 ; Make sure that integer poison-generating flags (i.e., nuw/nsw, exact and inbounds)
@@ -20,57 +19,19 @@ target triple = "x86_64-pc-linux-gnu"
 ; Drop poison-generating flags from 'sub' and 'getelementptr' feeding a masked load.
 ; Test for PR52111.
 define void @drop_scalar_nuw_nsw(float* noalias nocapture readonly %input,
+                                 float* %output) local_unnamed_addr #0 {
 ; CHECK-LABEL: @drop_scalar_nuw_nsw(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP6]], i32 4, <4 x i1> [[TMP4]], <4 x float> poison), !invariant.load !0
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP7]], i32 0
+; CHECK:         [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr float, float* [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[PREDPHI]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[I23:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT:    br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[I27:%.*]] = sub nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[I29:%.*]] = getelementptr inbounds float, float* [[INPUT]], i64 [[I27]]
-; CHECK-NEXT:    [[I30:%.*]] = load float, float* [[I29]], align 4, !invariant.load !0
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[I35:%.*]] = getelementptr inbounds float, float* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT:    store float [[I34]], float* [[I35]], align 4
-; CHECK-NEXT:    [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       loop.exit:
-; CHECK-NEXT:    ret void
-;
-  float* %output) local_unnamed_addr #0 {
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP9]], i32 4, <4 x i1> [[TMP7]], <4 x float> poison), !invariant.load !0
 entry:
   br label %loop.header
 
@@ -100,57 +61,19 @@ loop.exit:
 ; Drop poison-generating flags from 'sub' and 'getelementptr' feeding a masked load.
 ; In this case, 'sub' and 'getelementptr' are not guarded by the predicate.
 define void @drop_nonpred_scalar_nuw_nsw(float* noalias nocapture readonly %input,
+                                         float* %output) local_unnamed_addr #0 {
 ; CHECK-LABEL: @drop_nonpred_scalar_nuw_nsw(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP6]], i32 4, <4 x i1> [[TMP4]], <4 x float> poison), !invariant.load !0
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP7]], i32 0
+; CHECK:         [[TMP5:%.*]] = sub i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr float, float* [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[PREDPHI]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[I27:%.*]] = sub i64 [[IV]], 1
-; CHECK-NEXT:    [[I29:%.*]] = getelementptr float, float* [[INPUT]], i64 [[I27]]
-; CHECK-NEXT:    [[I23:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT:    br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[I30:%.*]] = load float, float* [[I29]], align 4, !invariant.load !0
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[I35:%.*]] = getelementptr inbounds float, float* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT:    store float [[I34]], float* [[I35]], align 4
-; CHECK-NEXT:    [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       loop.exit:
-; CHECK-NEXT:    ret void
-;
-  float* %output) local_unnamed_addr #0 {
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP9]], i32 4, <4 x i1> [[TMP7]], <4 x float> poison), !invariant.load !0
 entry:
   br label %loop.header
 
@@ -179,57 +102,18 @@ loop.exit:
 
 ; Preserve poison-generating flags from vector 'sub', 'mul' and 'getelementptr' feeding a masked gather.
 define void @preserve_vector_nuw_nsw(float* noalias nocapture readonly %input,
+                                     float* %output) local_unnamed_addr #0 {
 ; CHECK-LABEL: @preserve_vector_nuw_nsw(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw nsw <4 x i64> [[TMP2]], <i64 2, i64 2, i64 2, i64 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], <4 x i64> [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP4]], i32 4, <4 x i1> [[TMP5]], <4 x float> undef), !invariant.load !0
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x float> [[WIDE_MASKED_GATHER]], <4 x float> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[PREDPHI]], <4 x float>* [[TMP8]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[I23:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT:    br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[I27:%.*]] = sub nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[I28:%.*]] = mul nuw nsw i64 [[I27]], 2
-; CHECK-NEXT:    [[I29:%.*]] = getelementptr inbounds float, float* [[INPUT]], i64 [[I28]]
-; CHECK-NEXT:    [[I30:%.*]] = load float, float* [[I29]], align 4, !invariant.load !0
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[I35:%.*]] = getelementptr inbounds float, float* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT:    store float [[I34]], float* [[I35]], align 4
-; CHECK-NEXT:    [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       loop.exit:
-; CHECK-NEXT:    ret void
-;
-  float* %output) local_unnamed_addr #0 {
+; CHECK:         [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw <4 x i64> [[TMP5]], <i64 2, i64 2, i64 2, i64 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], <4 x i64> [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP7]], i32 4, <4 x i1> [[TMP8]], <4 x float> undef), !invariant.load !0
 entry:
   br label %loop.header
 
@@ -259,64 +143,21 @@ loop.exit:
 
 ; Drop poison-generating flags from vector 'sub' and 'gep' feeding a masked load.
 define void @drop_vector_nuw_nsw(float* noalias nocapture readonly %input,
+                                 float* %output, float** noalias %ptrs) local_unnamed_addr #0 {
 ; CHECK-LABEL: @drop_vector_nuw_nsw(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float*, float** [[PTRS:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr float, float* [[INPUT:%.*]], <4 x i64> [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float*, float** [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float** [[TMP5]] to <4 x float*>*
-; CHECK-NEXT:    store <4 x float*> [[TMP4]], <4 x float*>* [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float*> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr float, float* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP10]], i32 4, <4 x i1> [[TMP7]], <4 x float> poison), !invariant.load !0
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
+; CHECK:         [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float*, float** [[PTRS:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr float, float* [[INPUT:%.*]], <4 x i64> [[TMP6]]
+; CHECK:         [[TMP10:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float*> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr float, float* [[TMP11]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[PREDPHI]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[I23:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds float*, float** [[PTRS]], i64 [[IV]]
-; CHECK-NEXT:    [[I27:%.*]] = sub nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[I29:%.*]] = getelementptr inbounds float, float* [[INPUT]], i64 [[I27]]
-; CHECK-NEXT:    store float* [[I29]], float** [[GEP]], align 8
-; CHECK-NEXT:    br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[I30:%.*]] = load float, float* [[I29]], align 4, !invariant.load !0
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[I35:%.*]] = getelementptr inbounds float, float* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT:    store float [[I34]], float* [[I35]], align 4
-; CHECK-NEXT:    [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       loop.exit:
-; CHECK-NEXT:    ret void
-;
-  float* %output, float** noalias %ptrs) local_unnamed_addr #0 {
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP13]], i32 4, <4 x i1> [[TMP10]], <4 x float> poison), !invariant.load !0
 entry:
   br label %loop.header
 
@@ -349,49 +190,18 @@ loop.exit:
 ; of any masked load/store/gather/scatter.
 define void @preserve_nuw_nsw_no_addr(i64* %output) local_unnamed_addr #0 {
 ; CHECK-LABEL: @preserve_nuw_nsw_no_addr(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[TMP2]], <4 x i64> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <4 x i64>*
-; CHECK-NEXT:    store <4 x i64> [[PREDPHI]], <4 x i64>* [[TMP6]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[I23:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT:    br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[I27:%.*]] = sub nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[I34:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[I27]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[I35:%.*]] = getelementptr inbounds i64, i64* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[I34]], i64* [[I35]], align 4
-; CHECK-NEXT:    [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       loop.exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK:         [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP5]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[OUTPUT:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <4 x i64>*
+; CHECK-NEXT:    store <4 x i64> [[PREDPHI]], <4 x i64>* [[TMP9]], align 4
 entry:
   br label %loop.header
 
@@ -418,63 +228,22 @@ loop.exit:
 
 ; Drop poison-generating flags from 'sdiv' and 'getelementptr' feeding a masked load.
 define void @drop_scalar_exact(float* noalias nocapture readonly %input,
+                               float* %output) local_unnamed_addr #0 {
 ; CHECK-LABEL: @drop_scalar_exact(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = and <4 x i1> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP9]], i32 4, <4 x i1> [[TMP7]], <4 x float> poison), !invariant.load !0
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
+; CHECK:         [[TMP4:%.*]] = icmp ne <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i1> [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i32 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[PREDPHI]], <4 x float>* [[TMP12]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[I7:%.*]] = icmp ne i64 [[IV]], 0
-; CHECK-NEXT:    [[I8:%.*]] = and i64 [[IV]], 1
-; CHECK-NEXT:    [[I9:%.*]] = icmp eq i64 [[I8]], 0
-; CHECK-NEXT:    [[I10:%.*]] = and i1 [[I7]], [[I9]]
-; CHECK-NEXT:    br i1 [[I10]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[I26:%.*]] = sdiv exact i64 [[IV]], 1
-; CHECK-NEXT:    [[I29:%.*]] = getelementptr inbounds float, float* [[INPUT]], i64 [[I26]]
-; CHECK-NEXT:    [[I30:%.*]] = load float, float* [[I29]], align 4, !invariant.load !0
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[I35:%.*]] = getelementptr inbounds float, float* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT:    store float [[I34]], float* [[I35]], align 4
-; CHECK-NEXT:    [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       loop.exit:
-; CHECK-NEXT:    ret void
-;
-  float* %output) local_unnamed_addr #0 {
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP12]], i32 4, <4 x i1> [[TMP10]], <4 x float> poison), !invariant.load !0
 entry:
   br label %loop.header
 
@@ -506,61 +275,21 @@ loop.exit:
 
 ; Preserve poison-generating flags from 'sdiv' and 'getelementptr' feeding a masked gather.
 define void @preserve_vector_exact_no_addr(float* noalias nocapture readonly %input,
+                                           float* %output) local_unnamed_addr #0 {
 ; CHECK-LABEL: @preserve_vector_exact_no_addr(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = and <4 x i1> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], <4 x i64> [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP6]], i32 4, <4 x i1> [[TMP7]], <4 x float> undef), !invariant.load !0
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_MASKED_GATHER]], <4 x float> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[PREDPHI]], <4 x float>* [[TMP10]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[I7:%.*]] = icmp ne i64 [[IV]], 0
-; CHECK-NEXT:    [[I8:%.*]] = and i64 [[IV]], 1
-; CHECK-NEXT:    [[I9:%.*]] = icmp eq i64 [[I8]], 0
-; CHECK-NEXT:    [[I10:%.*]] = and i1 [[I7]], [[I9]]
-; CHECK-NEXT:    br i1 [[I10]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[I26:%.*]] = sdiv exact i64 [[IV]], 2
-; CHECK-NEXT:    [[I29:%.*]] = getelementptr inbounds float, float* [[INPUT]], i64 [[I26]]
-; CHECK-NEXT:    [[I30:%.*]] = load float, float* [[I29]], align 4, !invariant.load !0
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[I35:%.*]] = getelementptr inbounds float, float* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT:    store float [[I34]], float* [[I35]], align 4
-; CHECK-NEXT:    [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       loop.exit:
-; CHECK-NEXT:    ret void
+; CHECK:         [[TMP4:%.*]] = icmp ne <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i1> [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> [[TMP10]], <4 x float> undef), !invariant.load !0
 ;
-  float* %output) local_unnamed_addr #0 {
 entry:
   br label %loop.header
 
@@ -594,49 +323,18 @@ loop.exit:
 ; of any masked load/store/gather/scatter.
 define void @preserve_exact_no_addr(i64* %output) local_unnamed_addr #0 {
 ; CHECK-LABEL: @preserve_exact_no_addr(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[TMP2]], <4 x i64> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <4 x i64>*
-; CHECK-NEXT:    store <4 x i64> [[PREDPHI]], <4 x i64>* [[TMP6]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[I23:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT:    br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[I27:%.*]] = sdiv exact i64 [[IV]], 2
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[I34:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[I27]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[I35:%.*]] = getelementptr inbounds i64, i64* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[I34]], i64* [[I35]], align 4
-; CHECK-NEXT:    [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK:       loop.exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK:         [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP5]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[OUTPUT:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <4 x i64>*
+; CHECK-NEXT:    store <4 x i64> [[PREDPHI]], <4 x i64>* [[TMP9]], align 4
 entry:
   br label %loop.header
 

diff --git a/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
index caf63ce63d325..162fb6c4f87a4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
@@ -9,53 +9,6 @@ target triple = "x86_64-apple-macosx10.8.0"
 ; CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %neg = fneg float %{{.*}}
 ; CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %neg = fneg float %{{.*}}
 define void @fneg_cost(float* %a, i64 %n) {
-; CHECK-LABEL: @fneg_cost(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fneg <4 x float> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fneg <4 x float> [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[NEG:%.*]] = fneg float [[TMP13]]
-; CHECK-NEXT:    store float [[NEG]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body
 for.body:                                         ; preds = %for.body.preheader, %for.body

diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll
index 88a3497803531..e8157d2816dd2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
@@ -8,75 +7,6 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 ; CHECK: cost of 1 for VF 1 For instruction:   %conv = fptosi float %tmp to i8
 define void @float_to_sint8_cost(i8* noalias nocapture %a, float* noalias nocapture readonly %b) nounwind {
-; CHECK-LABEL: @float_to_sint8_cost(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = fptosi <8 x float> [[WIDE_LOAD]] to <8 x i8>
-; CHECK-NEXT:    [[TMP17:%.*]] = fptosi <8 x float> [[WIDE_LOAD1]] to <8 x i8>
-; CHECK-NEXT:    [[TMP18:%.*]] = fptosi <8 x float> [[WIDE_LOAD2]] to <8 x i8>
-; CHECK-NEXT:    [[TMP19:%.*]] = fptosi <8 x float> [[WIDE_LOAD3]] to <8 x i8>
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP24]] to <8 x i8>*
-; CHECK-NEXT:    store <8 x i8> [[TMP16]], <8 x i8>* [[TMP25]], align 4
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i32 8
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i8* [[TMP26]] to <8 x i8>*
-; CHECK-NEXT:    store <8 x i8> [[TMP17]], <8 x i8>* [[TMP27]], align 4
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i32 16
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i8* [[TMP28]] to <8 x i8>*
-; CHECK-NEXT:    store <8 x i8> [[TMP18]], <8 x i8>* [[TMP29]], align 4
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i32 24
-; CHECK-NEXT:    [[TMP31:%.*]] = bitcast i8* [[TMP30]] to <8 x i8>*
-; CHECK-NEXT:    store <8 x i8> [[TMP19]], <8 x i8>* [[TMP31]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CONV:%.*]] = fptosi float [[TMP]] to i8
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i8 [[CONV]], i8* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 256
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body
 for.body:

diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index 1dd78ddfab37a..0007427eecb96 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -175,31 +175,31 @@ define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %tr
 ; AVX512:       vector.body:
 ; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
-; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
-; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
-; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
-; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP19]], <16 x float*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
 ; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
-; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; AVX512-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; AVX512-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
 ; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP6]], 0
+; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
 ; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX512:       if.then:
 ; AVX512-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
-; AVX512-NEXT:    [[TMP7:%.*]] = load float, float* [[B]], align 4
-; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP7]], 5.000000e-01
+; AVX512-NEXT:    [[TMP23:%.*]] = load float, float* [[B]], align 4
+; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
 ; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
 ; AVX512-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
 ; AVX512-NEXT:    br label [[FOR_INC]]
@@ -322,31 +322,31 @@ define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noali
 ; AVX512:       vector.body:
 ; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
-; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
-; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
-; AVX512-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
-; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> [[VEC_IND]], i32 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> [[VEC_IND]], i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP19]], <16 x float*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
-; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; AVX512-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; AVX512-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
 ; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP6]], 0
+; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
 ; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX512:       if.then:
 ; AVX512-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
-; AVX512-NEXT:    [[TMP7:%.*]] = load float, float* [[B]], align 4
-; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP7]], 5.000000e-01
+; AVX512-NEXT:    [[TMP23:%.*]] = load float, float* [[B]], align 4
+; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
 ; AVX512-NEXT:    [[B6:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[INDVARS_IV]], i32 1
 ; AVX512-NEXT:    store float [[ADD]], float* [[B6]], align 4
 ; AVX512-NEXT:    br label [[FOR_INC]]
@@ -456,31 +456,31 @@ define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspac
 ; AVX512:       vector.body:
 ; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
-; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
-; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
-; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
-; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP19]], <16 x float addrspace(1)*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
 ; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
-; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; AVX512-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; AVX512-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
 ; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP6]], 0
+; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
 ; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX512:       if.then:
 ; AVX512-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV]], i32 1
-; AVX512-NEXT:    [[TMP7:%.*]] = load float, float addrspace(1)* [[B]], align 4
-; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP7]], 5.000000e-01
+; AVX512-NEXT:    [[TMP23:%.*]] = load float, float addrspace(1)* [[B]], align 4
+; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
 ; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV]]
 ; AVX512-NEXT:    store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4
 ; AVX512-NEXT:    br label [[FOR_INC]]
@@ -589,31 +589,31 @@ define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspa
 ; AVX512:       vector.body:
 ; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
-; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
-; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
-; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
-; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP19]], <16 x float*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
 ; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
-; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; AVX512-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; AVX512-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
 ; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP6]], 0
+; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
 ; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX512:       if.then:
 ; AVX512-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV]], i32 1
-; AVX512-NEXT:    [[TMP7:%.*]] = load float, float addrspace(1)* [[B]], align 4
-; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP7]], 5.000000e-01
+; AVX512-NEXT:    [[TMP23:%.*]] = load float, float addrspace(1)* [[B]], align 4
+; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
 ; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
 ; AVX512-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
 ; AVX512-NEXT:    br label [[FOR_INC]]
@@ -722,31 +722,31 @@ define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspa
 ; AVX512:       vector.body:
 ; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
-; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
-; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
-; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
-; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP19]], <16 x float addrspace(1)*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
 ; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
-; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; AVX512-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; AVX512-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
 ; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP6]], 0
+; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
 ; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX512:       if.then:
 ; AVX512-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
-; AVX512-NEXT:    [[TMP7:%.*]] = load float, float* [[B]], align 4
-; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP7]], 5.000000e-01
+; AVX512-NEXT:    [[TMP23:%.*]] = load float, float* [[B]], align 4
+; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
 ; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV]]
 ; AVX512-NEXT:    store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4
 ; AVX512-NEXT:    br label [[FOR_INC]]

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index e12411c77a040..d2543a2caacff 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -63,34 +63,34 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp slt i32 [[TMP10]], [[TMP8]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP10]], [[TMP8]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[ADD_US]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0
-; CHECK-NEXT:    store i32 [[TMP20]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1
-; CHECK-NEXT:    store i32 [[TMP21]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2
-; CHECK-NEXT:    store i32 [[TMP22]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3
-; CHECK-NEXT:    store i32 [[TMP23]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP16]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[ADD_US]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP23]], i32 0
+; CHECK-NEXT:    store i32 [[TMP24]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP23]], i32 1
+; CHECK-NEXT:    store i32 [[TMP25]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP23]], i32 2
+; CHECK-NEXT:    store i32 [[TMP26]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3
+; CHECK-NEXT:    store i32 [[TMP27]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
index 71cd7f0aee4d3..e909403df52f3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
@@ -108,7 +108,7 @@ define double @sumIfVector(double* nocapture readonly %arr) {
 ; AVX-NEXT:    [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x double> [[TMP5]], <4 x double> [[VEC_PHI]]
 ; AVX-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; AVX-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; AVX-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; AVX:       middle.block:
 ; AVX-NEXT:    [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[PREDPHI]])
 ; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i32 32, 32
@@ -133,7 +133,7 @@ define double @sumIfVector(double* nocapture readonly %arr) {
 ; AVX-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
 ; AVX-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
 ; AVX-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
-; AVX-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]]
+; AVX-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], [[LOOP2:!llvm.loop !.*]]
 ; AVX:       done:
 ; AVX-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
 ; AVX-NEXT:    ret double [[TOT_NEXT_LCSSA]]

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
index 39a400f58e253..d438fa0506287 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -S | FileCheck %s
 
 ; This test checks that gather/scatter not used for i128 data type.
@@ -14,54 +13,9 @@ target triple = "x86_64-unknown-linux-gnu"
 @str = private unnamed_addr constant [45 x i8] c" PASS.....Y3 1/1 (BUBBLE SORT), X(25) = 5085\00"
 
 ; Function Attrs: noinline nounwind uwtable
-declare i32 @y3inner() #0
+declare i32 @y3inner() #0 
 
 define i32 @main() local_unnamed_addr #0 {
-; CHECK-LABEL: @main(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[DO_BODY:%.*]]
-; CHECK:       do.body:
-; CHECK-NEXT:    [[J_0:%.*]] = phi i128 [ 99999, [[ENTRY:%.*]] ], [ [[ADD10:%.*]], [[DO_BODY]] ]
-; CHECK-NEXT:    [[I_0:%.*]] = phi i128 [ 1, [[ENTRY]] ], [ [[ADD11:%.*]], [[DO_BODY]] ]
-; CHECK-NEXT:    [[AND:%.*]] = and i128 [[J_0]], 32767
-; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i128 [[I_0]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT:    store i128 [[AND]], i128* [[ARRAYIDX]], align 16
-; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i128 [[J_0]], 11111
-; CHECK-NEXT:    [[AND1:%.*]] = and i128 [[ADD]], 32767
-; CHECK-NEXT:    [[ADD2:%.*]] = add nuw nsw i128 [[I_0]], 1
-; CHECK-NEXT:    [[IDXPROM3:%.*]] = trunc i128 [[ADD2]] to i64
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 [[IDXPROM3]]
-; CHECK-NEXT:    store i128 [[AND1]], i128* [[ARRAYIDX4]], align 16
-; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i128 [[J_0]], 22222
-; CHECK-NEXT:    [[AND6:%.*]] = and i128 [[ADD5]], 32767
-; CHECK-NEXT:    [[ADD7:%.*]] = add nuw nsw i128 [[I_0]], 2
-; CHECK-NEXT:    [[IDXPROM8:%.*]] = trunc i128 [[ADD7]] to i64
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 [[IDXPROM8]]
-; CHECK-NEXT:    store i128 [[AND6]], i128* [[ARRAYIDX9]], align 16
-; CHECK-NEXT:    [[ADD10]] = add nuw nsw i128 [[J_0]], 33333
-; CHECK-NEXT:    [[ADD11]] = add nuw nsw i128 [[I_0]], 3
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i128 [[ADD11]], 149
-; CHECK-NEXT:    br i1 [[CMP]], label [[DO_BODY]], label [[DO_END:%.*]]
-; CHECK:       do.end:
-; CHECK-NEXT:    store i128 1766649, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 149), align 16
-; CHECK-NEXT:    store i128 1766649, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 150), align 16
-; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @y3inner()
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 25), align 16
-; CHECK-NEXT:    [[CMP12:%.*]] = icmp eq i128 [[TMP0]], 5085
-; CHECK-NEXT:    br i1 [[CMP12]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[PUTS:%.*]] = tail call i32 @puts(i8* getelementptr inbounds ([45 x i8], [45 x i8]* @str, i64 0, i64 0))
-; CHECK-NEXT:    br label [[IF_END:%.*]]
-; CHECK:       if.else:
-; CHECK-NEXT:    [[COERCE_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i128 [[TMP0]] to i64
-; CHECK-NEXT:    [[COERCE_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i128 [[TMP0]], 64
-; CHECK-NEXT:    [[COERCE_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i128 [[COERCE_SROA_2_0_EXTRACT_SHIFT]] to i64
-; CHECK-NEXT:    [[CALL14:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([44 x i8], [44 x i8]* @.str.1, i64 0, i64 0), i64 [[COERCE_SROA_0_0_EXTRACT_TRUNC]], i64 [[COERCE_SROA_2_0_EXTRACT_TRUNC]])
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    ret i32 0
-;
 entry:
   br label %do.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll
index 3b6a4884440ce..b1d56e3854e43 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s  -loop-vectorize -mtriple x86_64 -S | FileCheck %s
 
 %struct.ST4 = type { i32, i32, i32, i32 }
@@ -8,26 +7,17 @@
 
 ; Test from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=7560
 define void @test1(%struct.ST4* noalias %B) {
-; CHECK-LABEL: @test1(
+; CHECK-LABEL: @test1
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[B:%.*]], i64 [[INDVARS_IV]], i32 0
-; CHECK-NEXT:    store i32 65536, i32* [[P1]], align 4
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, i32* [[P1]], i32 -2147483648
-; CHECK-NEXT:    store i32 65536, i32* [[P2]], align 4
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], i64 [[INDVARS_IV]], i32 2
-; CHECK-NEXT:    store i32 10, i32* [[P3]], align 4
-; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], i64 [[INDVARS_IV]], i32 3
-; CHECK-NEXT:    store i32 12, i32* [[P4]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NEXT:    br label %for.body
 
+; CHECK-LABEL: for.body:
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK-NOT: store
+;
 entry:
   br label %for.body
 
@@ -52,23 +42,15 @@ for.cond.cleanup:                                 ; preds = %for.body
 ; Make sure interleave groups with a key being the special 'empty' value for
 ; the map do not cause a crash.
 define void @test_gap_empty_key() {
-; CHECK-LABEL: @test_gap_empty_key(
+; CHECK-LABEL: @test_gap_empty_key()
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* undef, i64 0, i64 [[IV_NEXT]]
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr i32, i32* [[ARRAYIDX]], i64 [[IV_NEXT]]
-; CHECK-NEXT:    [[G9:%.*]] = getelementptr i32, i32* [[G2]], i32 -2147483647
-; CHECK-NEXT:    store i32 0, i32* [[G2]], align 4
-; CHECK-NEXT:    store i32 1, i32* [[G9]], align 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ule i64 [[IV]], 1000
-; CHECK-NEXT:    br i1 false, label [[FOR_BODY]], label [[EXIT:%.*]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NEXT:    br label %for.body
 
+; CHECK-LABEL: for.body:
+; CHECK: store i32
+; CHECK: store i32
+; CHECK-NOT: store
+;
 entry:
   br label %for.body
 
@@ -90,23 +72,15 @@ exit:
 ; Make sure interleave groups with a key being the special 'tombstone' value for
 ; the map do not cause a crash.
 define void @test_tombstone_key() {
-; CHECK-LABEL: @test_tombstone_key(
+; CHECK-LABEL: @test_tombstone_key()
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* undef, i64 0, i64 [[IV_NEXT]]
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr i32, i32* [[ARRAYIDX]], i64 [[IV_NEXT]]
-; CHECK-NEXT:    [[G9:%.*]] = getelementptr i32, i32* [[G2]], i32 -2147483648
-; CHECK-NEXT:    store i32 0, i32* [[G2]], align 4
-; CHECK-NEXT:    store i32 1, i32* [[G9]], align 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ule i64 [[IV]], 1000
-; CHECK-NEXT:    br i1 false, label [[FOR_BODY]], label [[EXIT:%.*]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NEXT:    br label %for.body
 
+; CHECK-LABEL: for.body:
+; CHECK: store i32
+; CHECK: store i32
+; CHECK-NOT: store
+;
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
index bfd6c2739781a..a3b4ac77b743e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
@@ -75,10 +75,10 @@ define void @uaddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* no
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END27:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END24:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[CAST_CRD20:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END21:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD20]]
+; CHECK-NEXT:    [[IND_END29:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END26:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[CAST_CRD22:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END23:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD22]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 56
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -87,40 +87,40 @@ define void @uaddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* no
 ; CHECK-NEXT:    [[N_VEC19:%.*]] = and i64 [[TMP2]], 8589934584
 ; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i64 [[N_VEC19]] to i32
 ; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
-; CHECK-NEXT:    [[IND_END23:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[N_VEC19]]
-; CHECK-NEXT:    [[IND_END26:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[N_VEC19]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT34:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT33]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[IND_END25:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[N_VEC19]]
+; CHECK-NEXT:    [[IND_END28:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[N_VEC19]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT36:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT35]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[NEXT_GEP30:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[INDEX29]]
-; CHECK-NEXT:    [[NEXT_GEP31:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX29]]
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[NEXT_GEP30]] to <8 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <8 x i16>, <8 x i16>* [[TMP22]], align 2
-; CHECK-NEXT:    [[TMP23:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[WIDE_LOAD32]], <8 x i16> [[BROADCAST_SPLAT34]])
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i16* [[NEXT_GEP31]] to <8 x i16>*
-; CHECK-NEXT:    store <8 x i16> [[TMP23]], <8 x i16>* [[TMP24]], align 2
-; CHECK-NEXT:    [[INDEX_NEXT35]] = add nuw i64 [[INDEX29]], 8
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC19]]
-; CHECK-NEXT:    br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NEXT_GEP32:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[INDEX20]]
+; CHECK-NEXT:    [[NEXT_GEP33:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX20]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i16* [[NEXT_GEP32]] to <8 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD34:%.*]] = load <8 x i16>, <8 x i16>* [[TMP25]], align 2
+; CHECK-NEXT:    [[TMP26:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[WIDE_LOAD34]], <8 x i16> [[BROADCAST_SPLAT36]])
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i16* [[NEXT_GEP33]] to <8 x i16>*
+; CHECK-NEXT:    store <8 x i16> [[TMP26]], <8 x i16>* [[TMP27]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT21]] = add nuw i64 [[INDEX20]], 8
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT21]], [[N_VEC19]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N28:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC19]]
-; CHECK-NEXT:    br i1 [[CMP_N28]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N30:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC19]]
+; CHECK-NEXT:    br i1 [[CMP_N30]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL22:%.*]] = phi i16* [ [[IND_END23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL25:%.*]] = phi i16* [ [[IND_END26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END27]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END23]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL24:%.*]] = phi i16* [ [[IND_END25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END26]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL27:%.*]] = phi i16* [ [[IND_END28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END29]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[PSRC_ADDR_08:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL22]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[PDST_ADDR_07:%.*]] = phi i16* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[PSRC_ADDR_08:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL24]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[PDST_ADDR_07:%.*]] = phi i16* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL27]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PSRC_ADDR_08]], i64 1
-; CHECK-NEXT:    [[TMP26:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2
-; CHECK-NEXT:    [[TMP27:%.*]] = tail call i16 @llvm.uadd.sat.i16(i16 [[TMP26]], i16 [[OFFSET]])
+; CHECK-NEXT:    [[TMP29:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2
+; CHECK-NEXT:    [[TMP30:%.*]] = tail call i16 @llvm.uadd.sat.i16(i16 [[TMP29]], i16 [[OFFSET]])
 ; CHECK-NEXT:    [[INCDEC_PTR3]] = getelementptr inbounds i16, i16* [[PDST_ADDR_07]], i64 1
-; CHECK-NEXT:    store i16 [[TMP27]], i16* [[PDST_ADDR_07]], align 2
+; CHECK-NEXT:    store i16 [[TMP30]], i16* [[PDST_ADDR_07]], align 2
 ; CHECK-NEXT:    [[DEC]] = add i32 [[BLKCNT_09]], -1
 ; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
 ; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -218,10 +218,10 @@ define void @cttz(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocaptur
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END27:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END24:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[CAST_CRD20:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END21:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD20]]
+; CHECK-NEXT:    [[IND_END29:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END26:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[CAST_CRD22:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END23:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD22]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 112
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -230,40 +230,40 @@ define void @cttz(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocaptur
 ; CHECK-NEXT:    [[N_VEC19:%.*]] = and i64 [[TMP2]], 8589934576
 ; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i64 [[N_VEC19]] to i32
 ; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
-; CHECK-NEXT:    [[IND_END23:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC19]]
-; CHECK-NEXT:    [[IND_END26:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC19]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT34:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT33]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[IND_END25:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC19]]
+; CHECK-NEXT:    [[IND_END28:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC19]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT36:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT35]], <16 x i8> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[NEXT_GEP30:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX29]]
-; CHECK-NEXT:    [[NEXT_GEP31:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX29]]
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i8* [[NEXT_GEP30]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <16 x i8>, <16 x i8>* [[TMP22]], align 2
-; CHECK-NEXT:    [[TMP23:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[WIDE_LOAD32]], <16 x i8> [[WIDE_LOAD32]], <16 x i8> [[BROADCAST_SPLAT34]])
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i8* [[NEXT_GEP31]] to <16 x i8>*
-; CHECK-NEXT:    store <16 x i8> [[TMP23]], <16 x i8>* [[TMP24]], align 2
-; CHECK-NEXT:    [[INDEX_NEXT35]] = add nuw i64 [[INDEX29]], 16
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC19]]
-; CHECK-NEXT:    br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NEXT_GEP32:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX20]]
+; CHECK-NEXT:    [[NEXT_GEP33:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX20]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i8* [[NEXT_GEP32]] to <16 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD34:%.*]] = load <16 x i8>, <16 x i8>* [[TMP25]], align 2
+; CHECK-NEXT:    [[TMP26:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[WIDE_LOAD34]], <16 x i8> [[WIDE_LOAD34]], <16 x i8> [[BROADCAST_SPLAT36]])
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i8* [[NEXT_GEP33]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP26]], <16 x i8>* [[TMP27]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT21]] = add nuw i64 [[INDEX20]], 16
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT21]], [[N_VEC19]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N28:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC19]]
-; CHECK-NEXT:    br i1 [[CMP_N28]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N30:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC19]]
+; CHECK-NEXT:    br i1 [[CMP_N30]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL22:%.*]] = phi i8* [ [[IND_END23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL25:%.*]] = phi i8* [ [[IND_END26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END27]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END23]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL24:%.*]] = phi i8* [ [[IND_END25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END26]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL27:%.*]] = phi i8* [ [[IND_END28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END29]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL22]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL24]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL27]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PSRC_ADDR_08]], i64 1
-; CHECK-NEXT:    [[TMP26:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
-; CHECK-NEXT:    [[TMP27:%.*]] = tail call i8 @llvm.fshl.i8(i8 [[TMP26]], i8 [[TMP26]], i8 [[OFFSET]])
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
+; CHECK-NEXT:    [[TMP30:%.*]] = tail call i8 @llvm.fshl.i8(i8 [[TMP29]], i8 [[TMP29]], i8 [[OFFSET]])
 ; CHECK-NEXT:    [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PDST_ADDR_07]], i64 1
-; CHECK-NEXT:    store i8 [[TMP27]], i8* [[PDST_ADDR_07]], align 2
+; CHECK-NEXT:    store i8 [[TMP30]], i8* [[PDST_ADDR_07]], align 2
 ; CHECK-NEXT:    [[DEC]] = add i32 [[BLKCNT_09]], -1
 ; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
 ; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP7:![0-9]+]]

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
index 313c7a8a9f678..e33be35185941 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
@@ -50,29 +50,29 @@ define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[N_VEC12:%.*]] = and i64 [[SMAX6]], 9223372036854775800
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT17:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT16]], <8 x i32*> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT19:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT18]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[N_VEC13:%.*]] = and i64 [[SMAX6]], 9223372036854775800
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT17]], <8 x i32*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT20:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT19]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; CHECK-NEXT:    store <8 x i32> [[BROADCAST_SPLAT19]], <8 x i32>* [[TMP6]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC12]]
+; CHECK-NEXT:    store <8 x i32> [[BROADCAST_SPLAT20]], <8 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT23]] = add nuw i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC13]]
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT17]], zeroinitializer
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER20:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT17]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef)
-; CHECK-NEXT:    [[PREDPHI21:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER20]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
-; CHECK-NEXT:    [[CMP_N13:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC12]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI21]], i64 7
-; CHECK-NEXT:    br i1 [[CMP_N13]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT18]], zeroinitializer
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER21:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT18]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef)
+; CHECK-NEXT:    [[PREDPHI22:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER21]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
+; CHECK-NEXT:    [[CMP_N14:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI22]], i64 7
+; CHECK-NEXT:    br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
index 048ebd51cf412..9fa21e83ca236 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -68,31 +68,31 @@ define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[N_VEC17:%.*]] = and i64 [[SMAX6]], 9223372036854775800
+; CHECK-NEXT:    [[N_VEC18:%.*]] = and i64 [[SMAX6]], 9223372036854775800
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 [[BC_MERGE_RDX]], i64 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI20:%.*]] = phi <8 x i32> [ [[TMP14]], [[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI21:%.*]] = phi <8 x i32> [ [[TMP14]], [[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX20]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 8
-; CHECK-NEXT:    [[TMP17]] = add <8 x i32> [[VEC_PHI20]], [[WIDE_LOAD21]]
+; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 8
+; CHECK-NEXT:    [[TMP17]] = add <8 x i32> [[VEC_PHI21]], [[WIDE_LOAD22]]
 ; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC17]]
+; CHECK-NEXT:    [[INDEX_NEXT23]] = add nuw i64 [[INDEX20]], 8
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC18]]
 ; CHECK-NEXT:    br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP17]])
-; CHECK-NEXT:    [[CMP_N18:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC17]]
-; CHECK-NEXT:    br i1 [[CMP_N18]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N19:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC18]]
+; CHECK-NEXT:    br i1 [[CMP_N19]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX23:%.*]] = phi i32 [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX24:%.*]] = phi i32 [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[T0:%.*]] = phi i32 [ [[T3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX23]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[T0:%.*]] = phi i32 [ [[T3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX24]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
 ; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* [[T1]], align 8
 ; CHECK-NEXT:    [[T3]] = add i32 [[T0]], [[T2]]
@@ -177,31 +177,31 @@ define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b,
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[N_VEC13:%.*]] = and i64 [[SMAX6]], 9223372036854775800
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT17]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT20:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT19]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT22:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT21]], <8 x i32*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[N_VEC14:%.*]] = and i64 [[SMAX6]], 9223372036854775800
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT19:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT18]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT20]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT22:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT23:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT22]], <8 x i32*> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[INDEX16:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT24:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX16]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD16]], [[BROADCAST_SPLAT18]]
+; CHECK-NEXT:    [[WIDE_LOAD17:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD17]], [[BROADCAST_SPLAT19]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; CHECK-NEXT:    store <8 x i32> [[BROADCAST_SPLAT20]], <8 x i32>* [[TMP8]], align 4
-; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[BROADCAST_SPLAT20]], <8 x i32*> [[BROADCAST_SPLAT22]], i32 4, <8 x i1> [[TMP7]])
-; CHECK-NEXT:    [[INDEX_NEXT23]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC13]]
+; CHECK-NEXT:    store <8 x i32> [[BROADCAST_SPLAT21]], <8 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[BROADCAST_SPLAT21]], <8 x i32*> [[BROADCAST_SPLAT23]], i32 4, <8 x i1> [[TMP7]])
+; CHECK-NEXT:    [[INDEX_NEXT24]] = add nuw i64 [[INDEX16]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC14]]
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N14:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC13]]
-; CHECK-NEXT:    br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N15:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC14]]
+; CHECK-NEXT:    br i1 [[CMP_N15]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
@@ -307,34 +307,34 @@ define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32*
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[N_VEC23:%.*]] = and i64 [[SMAX16]], 9223372036854775800
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT28:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT27]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT30:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT29]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT32:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT33:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT32]], <8 x i32*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[N_VEC24:%.*]] = and i64 [[SMAX16]], 9223372036854775800
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT29:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT28]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT31:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT30]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT34:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT33]], <8 x i32*> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT34:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX26]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD26:%.*]] = load <8 x i32>, <8 x i32>* [[TMP8]], align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD26]], [[BROADCAST_SPLAT28]]
+; CHECK-NEXT:    [[WIDE_LOAD27:%.*]] = load <8 x i32>, <8 x i32>* [[TMP8]], align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD27]], [[BROADCAST_SPLAT29]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>*
-; CHECK-NEXT:    store <8 x i32> [[BROADCAST_SPLAT30]], <8 x i32>* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[C]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    store <8 x i32> [[BROADCAST_SPLAT31]], <8 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[C]], i64 [[INDEX26]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD31:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP12]], i32 8, <8 x i1> [[TMP9]], <8 x i32> poison)
-; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[WIDE_MASKED_LOAD31]], <8 x i32*> [[BROADCAST_SPLAT33]], i32 4, <8 x i1> [[TMP9]])
-; CHECK-NEXT:    [[INDEX_NEXT34]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT34]], [[N_VEC23]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD32:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP12]], i32 8, <8 x i1> [[TMP9]], <8 x i32> poison)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[WIDE_MASKED_LOAD32]], <8 x i32*> [[BROADCAST_SPLAT34]], i32 4, <8 x i1> [[TMP9]])
+; CHECK-NEXT:    [[INDEX_NEXT35]] = add nuw i64 [[INDEX26]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC24]]
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N24:%.*]] = icmp eq i64 [[SMAX16]], [[N_VEC23]]
-; CHECK-NEXT:    br i1 [[CMP_N24]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N25:%.*]] = icmp eq i64 [[SMAX16]], [[N_VEC24]]
+; CHECK-NEXT:    br i1 [[CMP_N25]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll
index 4afa3cef5db02..6619d4ec067bc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -vector-library=LIBMVEC-X86  -inject-tli-mappings -loop-vectorize -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -6,50 +5,8 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define void @sin_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @sin_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[VEC_IND]] to <2 x double>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <2 x i32> [[STEP_ADD]] to <2 x double>
-; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x double> @_ZGVbN2v_sin(<2 x double> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_sin(<2 x double> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[TMP8]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[TMP10]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @sin(double [[CONV]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_sin(<2 x double> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -76,42 +33,8 @@ for.end:
 
 define void @sin_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @sin_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @_ZGVdN8v_sinf(<8 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @sinf(float [[CONV]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <8 x float> @_ZGVdN8v_sinf(<8 x float> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -137,50 +60,8 @@ for.end:
 
 define void @sin_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @sin_f64_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[VEC_IND]] to <2 x double>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <2 x i32> [[STEP_ADD]] to <2 x double>
-; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x double> @_ZGVbN2v_sin(<2 x double> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_sin(<2 x double> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[TMP8]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[TMP10]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV]]) #[[ATTR4:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_sin(<2 x double> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -206,42 +87,8 @@ for.end:
 
 define void @sin_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @sin_f32_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @_ZGVdN8v_sinf(<8 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[CONV]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <8 x float> @_ZGVdN8v_sinf(<8 x float> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -267,50 +114,8 @@ for.end:
 
 define void @cos_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @cos_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[VEC_IND]] to <2 x double>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <2 x i32> [[STEP_ADD]] to <2 x double>
-; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x double> @_ZGVbN2v_cos(<2 x double> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_cos(<2 x double> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[TMP8]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[TMP10]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @cos(double [[CONV]]) #[[ATTR6:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_cos(<2 x double> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -336,42 +141,8 @@ for.end:
 
 define void @cos_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @cos_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @_ZGVdN8v_cosf(<8 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @cosf(float [[CONV]]) #[[ATTR7:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <8 x float> @_ZGVdN8v_cosf(<8 x float> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -397,50 +168,8 @@ for.end:
 
 define void @cos_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @cos_f64_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[VEC_IND]] to <2 x double>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <2 x i32> [[STEP_ADD]] to <2 x double>
-; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x double> @_ZGVbN2v_cos(<2 x double> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_cos(<2 x double> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[TMP8]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[TMP10]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV]]) #[[ATTR8:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_cos(<2 x double> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -466,42 +195,8 @@ for.end:
 
 define void @cos_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @cos_f32_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @_ZGVdN8v_cosf(<8 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[CONV]]) #[[ATTR9:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <8 x float> @_ZGVdN8v_cosf(<8 x float> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -527,44 +222,9 @@ for.end:
 
 
 define void @exp_f32(float* nocapture %varray) {
-; CHECK-LABEL: @exp_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast <8 x float> @_ZGVdN8v_expf(<8 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call fast float @expf(float [[CONV]]) #[[ATTR10:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @exp_f32
+; CHECK-LABEL:    vector.body
+; CHECK: <8 x float> @_ZGVdN8v_expf
 entry:
   br label %for.body
 
@@ -588,44 +248,9 @@ for.end:                                          ; preds = %for.body
 !93 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 define void @exp_f32_intrin(float* nocapture %varray) {
-; CHECK-LABEL: @exp_f32_intrin(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast <8 x float> @_ZGVdN8v_expf(<8 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call fast float @llvm.exp.f32(float [[CONV]]) #[[ATTR11:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @exp_f32_intrin
+; CHECK-LABEL: vector.body
+; CHECK: <8 x float> @_ZGVdN8v_expf
 entry:
   br label %for.body
 
@@ -650,44 +275,9 @@ for.end:                                          ; preds = %for.body
 
 
 define void @log_f32(float* nocapture %varray) {
-; CHECK-LABEL: @log_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast <8 x float> @_ZGVdN8v_logf(<8 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call fast float @logf(float [[CONV]]) #[[ATTR12:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @log_f32
+; CHECK-LABEL: vector.body
+; CHECK: <8 x float> @_ZGVdN8v_logf
 entry:
   br label %for.body
 
@@ -711,61 +301,9 @@ for.end:                                          ; preds = %for.body
 !113 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
-; CHECK-LABEL: @pow_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT:    [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4, !alias.scope !24
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast <8 x float> @_ZGVdN8vv_powf(<8 x float> [[TMP1]], <8 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP5]], <8 x float>* [[TMP8]], align 4, !alias.scope !27, !noalias !24
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @powf(float [[CONV]], float [[TMP1]]) #[[ATTR13:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @pow_f32
+; CHECK-LABEL:    vector.body
+; CHECK: <8 x float> @_ZGVdN8vv_powf
 entry:
   br label %for.body
 
@@ -791,61 +329,9 @@ for.end:                                          ; preds = %for.body
 !123 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 define void @pow_f32_intrin(float* nocapture %varray, float* nocapture readonly %exp) {
-; CHECK-LABEL: @pow_f32_intrin(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT:    [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4, !alias.scope !31
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast <8 x float> @_ZGVdN8vv_powf(<8 x float> [[TMP1]], <8 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP5]], <8 x float>* [[TMP8]], align 4, !alias.scope !34, !noalias !31
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.pow.f32(float [[CONV]], float [[TMP1]]) #[[ATTR14:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @pow_f32_intrin
+; CHECK-LABEL:    vector.body
+; CHECK: <8 x float> @_ZGVdN8vv_powf
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-finite.ll
index efe6ceda24bc9..68a4336007af6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-finite.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-finite.ll
@@ -1,55 +1,12 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -vector-library=LIBMVEC-X86 -inject-tli-mappings -loop-vectorize -S < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 define void @exp_f32(float* nocapture %varray) {
-; CHECK-LABEL: @exp_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <4 x float> @_ZGVbN4v___expf_finite(<4 x float> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast <4 x float> @_ZGVbN4v___expf_finite(<4 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call fast float @__expf_finite(float [[CONV]]) #[[ATTR0:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @exp_f32
+; CHECK-LABEL:    vector.body
+; CHECK: <4 x float> @_ZGVbN4v___expf_finite
+; CHECK: ret
 entry:
   br label %for.body
 
@@ -73,44 +30,10 @@ for.end:                                          ; preds = %for.body
 !3 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 define void @exp_f64(double* nocapture %varray) {
-; CHECK-LABEL: @exp_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast <4 x double> @_ZGVdN4v___exp_finite(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call fast double @__exp_finite(double [[CONV]]) #[[ATTR1:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @exp_f64
+; CHECK-LABEL:    vector.body
+; CHECK: <4 x double> @_ZGVdN4v___exp_finite
+; CHECK: ret
 entry:
   br label %for.body
 
@@ -134,52 +57,10 @@ for.end:                                          ; preds = %for.body
 !13 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 define void @log_f32(float* nocapture %varray) {
-; CHECK-LABEL: @log_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <4 x float> @_ZGVbN4v___logf_finite(<4 x float> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast <4 x float> @_ZGVbN4v___logf_finite(<4 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call fast float @__logf_finite(float [[CONV]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @log_f32
+; CHECK-LABEL:    vector.body
+; CHECK: <4 x float> @_ZGVbN4v___logf_finite
+; CHECK: ret
 entry:
   br label %for.body
 
@@ -203,44 +84,10 @@ for.end:                                          ; preds = %for.body
 !23 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 define void @log_f64(double* nocapture %varray) {
-; CHECK-LABEL: @log_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast <4 x double> @_ZGVdN4v___log_finite(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call fast double @__log_finite(double [[CONV]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @log_f64
+; CHECK-LABEL:    vector.body
+; CHECK: <4 x double> @_ZGVdN4v___log_finite
+; CHECK: ret
 entry:
   br label %for.body
 
@@ -264,73 +111,10 @@ for.end:                                          ; preds = %for.body
 !33 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
-; CHECK-LABEL: @pow_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT:    [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4, !alias.scope !10
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 4
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !10
-; CHECK-NEXT:    [[TMP10:%.*]] = call fast <4 x float> @_ZGVbN4vv___powf_finite(<4 x float> [[TMP2]], <4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP11:%.*]] = call fast <4 x float> @_ZGVbN4vv___powf_finite(<4 x float> [[TMP3]], <4 x float> [[WIDE_LOAD7]])
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP10]], <4 x float>* [[TMP15]], align 4, !alias.scope !13, !noalias !10
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 4
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP11]], <4 x float>* [[TMP17]], align 4, !alias.scope !13, !noalias !10
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @__powf_finite(float [[CONV]], float [[TMP1]]) #[[ATTR4:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @pow_f32
+; CHECK-LABEL:    vector.body
+; CHECK: <4 x float> @_ZGVbN4vv___powf_finite
+; CHECK: ret
 entry:
   br label %for.body
 
@@ -356,61 +140,10 @@ for.end:                                          ; preds = %for.body
 !43 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
-; CHECK-LABEL: @pow_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VARRAY1:%.*]] = bitcast double* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT:    [[EXP3:%.*]] = bitcast double* [[EXP:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[VARRAY]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr double, double* [[EXP]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[TMP3]] to <4 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 4, !alias.scope !17
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast <4 x double> @_ZGVdN4vv___pow_finite(<4 x double> [[TMP1]], <4 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP5]], <4 x double>* [[TMP8]], align 4, !alias.scope !20, !noalias !17
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[EXP]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast double @__pow_finite(double [[CONV]], double [[TMP1]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store double [[TMP2]], double* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @pow_f64
+; CHECK-LABEL:    vector.body
+; CHECK: <4 x double> @_ZGVdN4vv___pow_finite
+; CHECK: ret
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls.ll
index 6d8e5fb7704e2..7778554501a5f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -vector-library=LIBMVEC-X86  -inject-tli-mappings -loop-vectorize -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -6,42 +5,8 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define void @sin_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @sin_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @_ZGVdN4v_sin(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @sin(double [[CONV]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @_ZGVdN4v_sin(<4 x double> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -68,50 +33,8 @@ for.end:
 
 define void @sin_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @sin_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @_ZGVbN4v_sinf(<4 x float> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_sinf(<4 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @sinf(float [[CONV]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_sinf(<4 x float> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -137,42 +60,8 @@ for.end:
 
 define void @sin_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @sin_f64_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @_ZGVdN4v_sin(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV]]) #[[ATTR4:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @_ZGVdN4v_sin(<4 x double> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -198,50 +87,8 @@ for.end:
 
 define void @sin_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @sin_f32_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @_ZGVbN4v_sinf(<4 x float> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_sinf(<4 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[CONV]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_sinf(<4 x float> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -267,42 +114,8 @@ for.end:
 
 define void @cos_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @cos_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @_ZGVdN4v_cos(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @cos(double [[CONV]]) #[[ATTR6:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @_ZGVdN4v_cos(<4 x double> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -328,50 +141,8 @@ for.end:
 
 define void @cos_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @cos_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @_ZGVbN4v_cosf(<4 x float> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_cosf(<4 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @cosf(float [[CONV]]) #[[ATTR7:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_cosf(<4 x float> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -397,42 +168,8 @@ for.end:
 
 define void @cos_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @cos_f64_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @_ZGVdN4v_cos(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV]]) #[[ATTR8:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @_ZGVdN4v_cos(<4 x double> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -458,50 +195,8 @@ for.end:
 
 define void @cos_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @cos_f32_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @_ZGVbN4v_cosf(<4 x float> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_cosf(<4 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[CONV]]) #[[ATTR9:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL:    vector.body
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_cosf(<4 x float> [[TMP4:%.*]])
 ;
 entry:
   br label %for.body
@@ -527,52 +222,9 @@ for.end:
 
 
 define void @exp_f32(float* nocapture %varray) {
-; CHECK-LABEL: @exp_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <4 x float> @_ZGVbN4v_expf(<4 x float> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast <4 x float> @_ZGVbN4v_expf(<4 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call fast float @expf(float [[CONV]]) #[[ATTR10:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @exp_f32
+; CHECK-LABEL:    vector.body
+; CHECK: <4 x float> @_ZGVbN4v_expf
 entry:
   br label %for.body
 
@@ -596,52 +248,9 @@ for.end:                                          ; preds = %for.body
 !93 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 define void @exp_f32_intrin(float* nocapture %varray) {
-; CHECK-LABEL: @exp_f32_intrin(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <4 x float> @_ZGVbN4v_expf(<4 x float> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast <4 x float> @_ZGVbN4v_expf(<4 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call fast float @llvm.exp.f32(float [[CONV]]) #[[ATTR11:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @exp_f32_intrin
+; CHECK-LABEL: vector.body
+; CHECK: <4 x float> @_ZGVbN4v_expf
 entry:
   br label %for.body
 
@@ -666,52 +275,9 @@ for.end:                                          ; preds = %for.body
 
 
 define void @log_f32(float* nocapture %varray) {
-; CHECK-LABEL: @log_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <4 x float> @_ZGVbN4v_logf(<4 x float> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast <4 x float> @_ZGVbN4v_logf(<4 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call fast float @logf(float [[CONV]]) #[[ATTR12:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @log_f32
+; CHECK-LABEL: vector.body
+; CHECK: <4 x float> @_ZGVbN4v_logf
 entry:
   br label %for.body
 
@@ -735,73 +301,9 @@ for.end:                                          ; preds = %for.body
 !113 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
-; CHECK-LABEL: @pow_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT:    [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4, !alias.scope !24
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 4
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !24
-; CHECK-NEXT:    [[TMP10:%.*]] = call fast <4 x float> @_ZGVbN4vv_powf(<4 x float> [[TMP2]], <4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP11:%.*]] = call fast <4 x float> @_ZGVbN4vv_powf(<4 x float> [[TMP3]], <4 x float> [[WIDE_LOAD7]])
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP10]], <4 x float>* [[TMP15]], align 4, !alias.scope !27, !noalias !24
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 4
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP11]], <4 x float>* [[TMP17]], align 4, !alias.scope !27, !noalias !24
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @powf(float [[CONV]], float [[TMP1]]) #[[ATTR13:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @pow_f32
+; CHECK-LABEL:    vector.body
+; CHECK: <4 x float> @_ZGVbN4vv_powf
 entry:
   br label %for.body
 
@@ -827,73 +329,9 @@ for.end:                                          ; preds = %for.body
 !123 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 define void @pow_f32_intrin(float* nocapture %varray, float* nocapture readonly %exp) {
-; CHECK-LABEL: @pow_f32_intrin(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT:    [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4, !alias.scope !31
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 4
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !31
-; CHECK-NEXT:    [[TMP10:%.*]] = call fast <4 x float> @_ZGVbN4vv_powf(<4 x float> [[TMP2]], <4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP11:%.*]] = call fast <4 x float> @_ZGVbN4vv_powf(<4 x float> [[TMP3]], <4 x float> [[WIDE_LOAD7]])
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP10]], <4 x float>* [[TMP15]], align 4, !alias.scope !34, !noalias !31
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 4
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP11]], <4 x float>* [[TMP17]], align 4, !alias.scope !34, !noalias !31
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.pow.f32(float [[CONV]], float [[TMP1]]) #[[ATTR14:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @pow_f32_intrin
+; CHECK-LABEL:    vector.body
+; CHECK: <4 x float> @_ZGVbN4vv_powf
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
index 19e10b316f8de..2cfa1de30c45a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
@@ -36,22 +36,22 @@ define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 {
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX1]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 64
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 64
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <8 x i8>*
-; CHECK-NEXT:    store <8 x i8> [[WIDE_LOAD3]], <8 x i8>* [[TMP14]], align 64
-; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 16
+; CHECK-NEXT:    store <8 x i8> [[WIDE_LOAD4]], <8 x i8>* [[TMP14]], align 64
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 16
 ; CHECK-NEXT:    br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N1:%.*]] = icmp eq i64 17, 16
-; CHECK-NEXT:    br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N3:%.*]] = icmp eq i64 17, 16
+; CHECK-NEXT:    br i1 [[CMP_N3]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP_MEMCPY_EXPANSION:%.*]]

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index a648387d1fd66..d3b79f6bd0d30 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -300,8 +300,8 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
 ; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; AVX512:       vec.epilog.vector.body:
-; AVX512-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512-NEXT:    [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 0
+; AVX512-NEXT:    [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX512-NEXT:    [[TMP49:%.*]] = add i64 [[INDEX19]], 0
 ; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP49]]
 ; AVX512-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[TMP50]], i32 0
 ; AVX512-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <8 x i32>*
@@ -316,7 +316,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
 ; AVX512-NEXT:    [[TMP59:%.*]] = getelementptr i32, i32* [[TMP58]], i32 0
 ; AVX512-NEXT:    [[TMP60:%.*]] = bitcast i32* [[TMP59]] to <8 x i32>*
 ; AVX512-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP57]], <8 x i32>* [[TMP60]], i32 4, <8 x i1> [[TMP53]])
-; AVX512-NEXT:    [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8
+; AVX512-NEXT:    [[INDEX_NEXT22]] = add nuw i64 [[INDEX19]], 8
 ; AVX512-NEXT:    [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT22]], 10000
 ; AVX512-NEXT:    br i1 [[TMP61]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; AVX512:       vec.epilog.middle.block:
@@ -659,8 +659,8 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
 ; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; AVX512:       vec.epilog.vector.body:
-; AVX512-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512-NEXT:    [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 0
+; AVX512-NEXT:    [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX512-NEXT:    [[TMP49:%.*]] = add i64 [[INDEX19]], 0
 ; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP49]]
 ; AVX512-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP50]], i32 0
 ; AVX512-NEXT:    [[TMP52:%.*]] = bitcast i32 addrspace(1)* [[TMP51]] to <8 x i32> addrspace(1)*
@@ -675,7 +675,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
 ; AVX512-NEXT:    [[TMP59:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP58]], i32 0
 ; AVX512-NEXT:    [[TMP60:%.*]] = bitcast i32 addrspace(1)* [[TMP59]] to <8 x i32> addrspace(1)*
 ; AVX512-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP57]], <8 x i32> addrspace(1)* [[TMP60]], i32 4, <8 x i1> [[TMP53]])
-; AVX512-NEXT:    [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8
+; AVX512-NEXT:    [[INDEX_NEXT22]] = add nuw i64 [[INDEX19]], 8
 ; AVX512-NEXT:    [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT22]], 10000
 ; AVX512-NEXT:    br i1 [[TMP61]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; AVX512:       vec.epilog.middle.block:
@@ -1038,8 +1038,8 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
 ; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; AVX512:       vec.epilog.vector.body:
-; AVX512-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512-NEXT:    [[TMP53:%.*]] = add i64 [[OFFSET_IDX]], 0
+; AVX512-NEXT:    [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX512-NEXT:    [[TMP53:%.*]] = add i64 [[INDEX19]], 0
 ; AVX512-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP53]]
 ; AVX512-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TMP54]], i32 0
 ; AVX512-NEXT:    [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <8 x i32>*
@@ -1055,7 +1055,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
 ; AVX512-NEXT:    [[TMP64:%.*]] = getelementptr float, float* [[TMP63]], i32 0
 ; AVX512-NEXT:    [[TMP65:%.*]] = bitcast float* [[TMP64]] to <8 x float>*
 ; AVX512-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP62]], <8 x float>* [[TMP65]], i32 4, <8 x i1> [[TMP57]])
-; AVX512-NEXT:    [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8
+; AVX512-NEXT:    [[INDEX_NEXT22]] = add nuw i64 [[INDEX19]], 8
 ; AVX512-NEXT:    [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT22]], 10000
 ; AVX512-NEXT:    br i1 [[TMP66]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
 ; AVX512:       vec.epilog.middle.block:

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
index 9aaf533f002b4..f824b84fd1b2d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt < %s -S -debug -loop-vectorize -mcpu=slm 2>&1 | FileCheck %s --check-prefix=SLM
 
@@ -6,127 +5,6 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 define i8 @mul_i8(i8* %dataA, i8* %dataB, i32 %N) {
-; SLM-LABEL: @mul_i8(
-; SLM-NEXT:  entry:
-; SLM-NEXT:    [[CMP12:%.*]] = icmp eq i32 [[N:%.*]], 0
-; SLM-NEXT:    br i1 [[CMP12]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; SLM:       for.body.preheader:
-; SLM-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; SLM-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
-; SLM-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; SLM:       vector.ph:
-; SLM-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
-; SLM-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; SLM-NEXT:    br label [[VECTOR_BODY:%.*]]
-; SLM:       vector.body:
-; SLM-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SLM-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
-; SLM-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ]
-; SLM-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; SLM-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; SLM-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[DATAA:%.*]], i64 [[TMP0]]
-; SLM-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[DATAA]], i64 [[TMP1]]
-; SLM-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
-; SLM-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1
-; SLM-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 4
-; SLM-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
-; SLM-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; SLM-NEXT:    [[TMP8:%.*]] = sext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
-; SLM-NEXT:    [[TMP9:%.*]] = sext <4 x i8> [[WIDE_LOAD2]] to <4 x i32>
-; SLM-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[DATAB:%.*]], i64 [[TMP0]]
-; SLM-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[DATAB]], i64 [[TMP1]]
-; SLM-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 0
-; SLM-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
-; SLM-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
-; SLM-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 4
-; SLM-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
-; SLM-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
-; SLM-NEXT:    [[TMP16:%.*]] = sext <4 x i8> [[WIDE_LOAD3]] to <4 x i32>
-; SLM-NEXT:    [[TMP17:%.*]] = sext <4 x i8> [[WIDE_LOAD4]] to <4 x i32>
-; SLM-NEXT:    [[TMP18:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP8]]
-; SLM-NEXT:    [[TMP19:%.*]] = mul nsw <4 x i32> [[TMP17]], [[TMP9]]
-; SLM-NEXT:    [[TMP20:%.*]] = zext <4 x i8> [[WIDE_LOAD3]] to <4 x i32>
-; SLM-NEXT:    [[TMP21:%.*]] = zext <4 x i8> [[WIDE_LOAD4]] to <4 x i32>
-; SLM-NEXT:    [[TMP22:%.*]] = mul nsw <4 x i32> [[TMP20]], [[TMP8]]
-; SLM-NEXT:    [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP21]], [[TMP9]]
-; SLM-NEXT:    [[TMP24:%.*]] = add <4 x i32> [[TMP18]], [[TMP22]]
-; SLM-NEXT:    [[TMP25:%.*]] = add <4 x i32> [[TMP19]], [[TMP23]]
-; SLM-NEXT:    [[TMP26:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
-; SLM-NEXT:    [[TMP27:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i32>
-; SLM-NEXT:    [[TMP28:%.*]] = mul nsw <4 x i32> [[TMP26]], [[TMP20]]
-; SLM-NEXT:    [[TMP29:%.*]] = mul nsw <4 x i32> [[TMP27]], [[TMP21]]
-; SLM-NEXT:    [[TMP30:%.*]] = add <4 x i32> [[TMP24]], [[TMP28]]
-; SLM-NEXT:    [[TMP31:%.*]] = add <4 x i32> [[TMP25]], [[TMP29]]
-; SLM-NEXT:    [[TMP32:%.*]] = mul nsw <4 x i32> <i32 -120, i32 -120, i32 -120, i32 -120>, [[TMP16]]
-; SLM-NEXT:    [[TMP33:%.*]] = mul nsw <4 x i32> <i32 -120, i32 -120, i32 -120, i32 -120>, [[TMP17]]
-; SLM-NEXT:    [[TMP34:%.*]] = add <4 x i32> [[TMP30]], [[TMP32]]
-; SLM-NEXT:    [[TMP35:%.*]] = add <4 x i32> [[TMP31]], [[TMP33]]
-; SLM-NEXT:    [[TMP36:%.*]] = mul nsw <4 x i32> <i32 250, i32 250, i32 250, i32 250>, [[TMP16]]
-; SLM-NEXT:    [[TMP37:%.*]] = mul nsw <4 x i32> <i32 250, i32 250, i32 250, i32 250>, [[TMP17]]
-; SLM-NEXT:    [[TMP38:%.*]] = add <4 x i32> [[TMP34]], [[TMP36]]
-; SLM-NEXT:    [[TMP39:%.*]] = add <4 x i32> [[TMP35]], [[TMP37]]
-; SLM-NEXT:    [[TMP40:%.*]] = mul nsw <4 x i32> <i32 -120, i32 -120, i32 -120, i32 -120>, [[TMP20]]
-; SLM-NEXT:    [[TMP41:%.*]] = mul nsw <4 x i32> <i32 -120, i32 -120, i32 -120, i32 -120>, [[TMP21]]
-; SLM-NEXT:    [[TMP42:%.*]] = add <4 x i32> [[TMP38]], [[TMP40]]
-; SLM-NEXT:    [[TMP43:%.*]] = add <4 x i32> [[TMP39]], [[TMP41]]
-; SLM-NEXT:    [[TMP44:%.*]] = mul nsw <4 x i32> <i32 250, i32 250, i32 250, i32 250>, [[TMP20]]
-; SLM-NEXT:    [[TMP45:%.*]] = mul nsw <4 x i32> <i32 250, i32 250, i32 250, i32 250>, [[TMP21]]
-; SLM-NEXT:    [[TMP46:%.*]] = add <4 x i32> [[TMP42]], [[TMP44]]
-; SLM-NEXT:    [[TMP47:%.*]] = add <4 x i32> [[TMP43]], [[TMP45]]
-; SLM-NEXT:    [[TMP48:%.*]] = add <4 x i32> [[VEC_PHI]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    [[TMP49:%.*]] = add <4 x i32> [[VEC_PHI1]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    [[TMP50]] = add <4 x i32> [[TMP48]], [[TMP46]]
-; SLM-NEXT:    [[TMP51]] = add <4 x i32> [[TMP49]], [[TMP47]]
-; SLM-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; SLM-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SLM-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; SLM:       middle.block:
-; SLM-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP51]], [[TMP50]]
-; SLM-NEXT:    [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; SLM-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; SLM-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; SLM:       scalar.ph:
-; SLM-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; SLM-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
-; SLM-NEXT:    br label [[FOR_BODY:%.*]]
-; SLM:       for.cond.cleanup.loopexit:
-; SLM-NEXT:    [[ADD4_LCSSA:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
-; SLM-NEXT:    [[PHITMP:%.*]] = trunc i32 [[ADD4_LCSSA]] to i8
-; SLM-NEXT:    br label [[FOR_COND_CLEANUP]]
-; SLM:       for.cond.cleanup:
-; SLM-NEXT:    [[ACC_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[PHITMP]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; SLM-NEXT:    ret i8 [[ACC_0_LCSSA]]
-; SLM:       for.body:
-; SLM-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; SLM-NEXT:    [[ACC_013:%.*]] = phi i32 [ [[ADD4]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; SLM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[DATAA]], i64 [[INDVARS_IV]]
-; SLM-NEXT:    [[TMP54:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; SLM-NEXT:    [[CONV:%.*]] = sext i8 [[TMP54]] to i32
-; SLM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[DATAB]], i64 [[INDVARS_IV]]
-; SLM-NEXT:    [[TMP55:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; SLM-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP55]] to i32
-; SLM-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV3]], [[CONV]]
-; SLM-NEXT:    [[CONV4:%.*]] = zext i8 [[TMP55]] to i32
-; SLM-NEXT:    [[MUL2:%.*]] = mul nsw i32 [[CONV4]], [[CONV]]
-; SLM-NEXT:    [[SUM0:%.*]] = add i32 [[MUL]], [[MUL2]]
-; SLM-NEXT:    [[CONV5:%.*]] = zext i8 [[TMP54]] to i32
-; SLM-NEXT:    [[MUL3:%.*]] = mul nsw i32 [[CONV5]], [[CONV4]]
-; SLM-NEXT:    [[SUM1:%.*]] = add i32 [[SUM0]], [[MUL3]]
-; SLM-NEXT:    [[MUL4:%.*]] = mul nsw i32 -120, [[CONV3]]
-; SLM-NEXT:    [[SUM2:%.*]] = add i32 [[SUM1]], [[MUL4]]
-; SLM-NEXT:    [[MUL5:%.*]] = mul nsw i32 250, [[CONV3]]
-; SLM-NEXT:    [[SUM3:%.*]] = add i32 [[SUM2]], [[MUL5]]
-; SLM-NEXT:    [[MUL6:%.*]] = mul nsw i32 -120, [[CONV4]]
-; SLM-NEXT:    [[SUM4:%.*]] = add i32 [[SUM3]], [[MUL6]]
-; SLM-NEXT:    [[MUL7:%.*]] = mul nsw i32 250, [[CONV4]]
-; SLM-NEXT:    [[SUM5:%.*]] = add i32 [[SUM4]], [[MUL7]]
-; SLM-NEXT:    [[ADD:%.*]] = add i32 [[ACC_013]], 5
-; SLM-NEXT:    [[ADD4]] = add i32 [[ADD]], [[SUM5]]
-; SLM-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; SLM-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; SLM-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
 entry:
   %cmp12 = icmp eq i32 %N, 0
   br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
@@ -152,33 +30,40 @@ for.body:                                         ; preds = %for.body.preheader,
   %arrayidx2 = getelementptr inbounds i8, i8* %dataB, i64 %indvars.iv
   %1 = load i8, i8* %arrayidx2, align 1
   %conv3 = sext i8 %1 to i32
-; sources of the mul is sext\sext from i8
-; use pmullw\sext seq.
+; sources of the mul is sext\sext from i8 
+; use pmullw\sext seq.   
+; SLM:  cost of 3 for VF 2 {{.*}} mul nsw i32 %conv3, %conv
   %mul = mul nsw i32 %conv3, %conv
 ; sources of the mul is zext\sext from i8
 ; use pmulhw\pmullw\pshuf
+; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 %conv4, %conv
   %conv4 = zext i8 %1 to i32
   %mul2 = mul nsw i32 %conv4, %conv
   %sum0 = add i32 %mul, %mul2
 ; sources of the mul is zext\zext from i8
 ; use pmullw\zext
+; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 %conv5, %conv4
   %conv5 = zext i8 %0 to i32
   %mul3 = mul nsw i32 %conv5, %conv4
   %sum1 = add i32 %sum0, %mul3
 ; sources of the mul is sext\-120
 ; use pmullw\sext
+; SLM:  cost of 3 for VF 2 {{.*}} mul nsw i32 -120, %conv3
   %mul4 = mul nsw i32 -120, %conv3
   %sum2 = add i32 %sum1, %mul4
 ; sources of the mul is sext\250
 ; use pmulhw\pmullw\pshuf
+; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 250, %conv3
   %mul5 = mul nsw i32 250, %conv3
   %sum3 = add i32 %sum2, %mul5
 ; sources of the mul is zext\-120
 ; use pmulhw\pmullw\pshuf
+; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 -120, %conv4
   %mul6 = mul nsw i32 -120, %conv4
   %sum4 = add i32 %sum3, %mul6
 ; sources of the mul is zext\250
 ; use pmullw\zext
+; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 250, %conv4
   %mul7 = mul nsw i32 250, %conv4
   %sum5 = add i32 %sum4, %mul7
   %add = add i32 %acc.013, 5
@@ -189,127 +74,6 @@ for.body:                                         ; preds = %for.body.preheader,
 }
 
 define i16 @mul_i16(i16* %dataA, i16* %dataB, i32 %N) {
-; SLM-LABEL: @mul_i16(
-; SLM-NEXT:  entry:
-; SLM-NEXT:    [[CMP12:%.*]] = icmp eq i32 [[N:%.*]], 0
-; SLM-NEXT:    br i1 [[CMP12]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; SLM:       for.body.preheader:
-; SLM-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; SLM-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
-; SLM-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; SLM:       vector.ph:
-; SLM-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
-; SLM-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; SLM-NEXT:    br label [[VECTOR_BODY:%.*]]
-; SLM:       vector.body:
-; SLM-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SLM-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
-; SLM-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ]
-; SLM-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; SLM-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; SLM-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[DATAA:%.*]], i64 [[TMP0]]
-; SLM-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[DATAA]], i64 [[TMP1]]
-; SLM-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; SLM-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 1
-; SLM-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 4
-; SLM-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
-; SLM-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 1
-; SLM-NEXT:    [[TMP8:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
-; SLM-NEXT:    [[TMP9:%.*]] = sext <4 x i16> [[WIDE_LOAD2]] to <4 x i32>
-; SLM-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[DATAB:%.*]], i64 [[TMP0]]
-; SLM-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[DATAB]], i64 [[TMP1]]
-; SLM-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 0
-; SLM-NEXT:    [[TMP13:%.*]] = bitcast i16* [[TMP12]] to <4 x i16>*
-; SLM-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP13]], align 1
-; SLM-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 4
-; SLM-NEXT:    [[TMP15:%.*]] = bitcast i16* [[TMP14]] to <4 x i16>*
-; SLM-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP15]], align 1
-; SLM-NEXT:    [[TMP16:%.*]] = sext <4 x i16> [[WIDE_LOAD3]] to <4 x i32>
-; SLM-NEXT:    [[TMP17:%.*]] = sext <4 x i16> [[WIDE_LOAD4]] to <4 x i32>
-; SLM-NEXT:    [[TMP18:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP8]]
-; SLM-NEXT:    [[TMP19:%.*]] = mul nsw <4 x i32> [[TMP17]], [[TMP9]]
-; SLM-NEXT:    [[TMP20:%.*]] = zext <4 x i16> [[WIDE_LOAD3]] to <4 x i32>
-; SLM-NEXT:    [[TMP21:%.*]] = zext <4 x i16> [[WIDE_LOAD4]] to <4 x i32>
-; SLM-NEXT:    [[TMP22:%.*]] = mul nsw <4 x i32> [[TMP20]], [[TMP8]]
-; SLM-NEXT:    [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP21]], [[TMP9]]
-; SLM-NEXT:    [[TMP24:%.*]] = add <4 x i32> [[TMP18]], [[TMP22]]
-; SLM-NEXT:    [[TMP25:%.*]] = add <4 x i32> [[TMP19]], [[TMP23]]
-; SLM-NEXT:    [[TMP26:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
-; SLM-NEXT:    [[TMP27:%.*]] = zext <4 x i16> [[WIDE_LOAD2]] to <4 x i32>
-; SLM-NEXT:    [[TMP28:%.*]] = mul nsw <4 x i32> [[TMP26]], [[TMP20]]
-; SLM-NEXT:    [[TMP29:%.*]] = mul nsw <4 x i32> [[TMP27]], [[TMP21]]
-; SLM-NEXT:    [[TMP30:%.*]] = add <4 x i32> [[TMP24]], [[TMP28]]
-; SLM-NEXT:    [[TMP31:%.*]] = add <4 x i32> [[TMP25]], [[TMP29]]
-; SLM-NEXT:    [[TMP32:%.*]] = mul nsw <4 x i32> <i32 -32000, i32 -32000, i32 -32000, i32 -32000>, [[TMP16]]
-; SLM-NEXT:    [[TMP33:%.*]] = mul nsw <4 x i32> <i32 -32000, i32 -32000, i32 -32000, i32 -32000>, [[TMP17]]
-; SLM-NEXT:    [[TMP34:%.*]] = add <4 x i32> [[TMP30]], [[TMP32]]
-; SLM-NEXT:    [[TMP35:%.*]] = add <4 x i32> [[TMP31]], [[TMP33]]
-; SLM-NEXT:    [[TMP36:%.*]] = mul nsw <4 x i32> <i32 64000, i32 64000, i32 64000, i32 64000>, [[TMP16]]
-; SLM-NEXT:    [[TMP37:%.*]] = mul nsw <4 x i32> <i32 64000, i32 64000, i32 64000, i32 64000>, [[TMP17]]
-; SLM-NEXT:    [[TMP38:%.*]] = add <4 x i32> [[TMP34]], [[TMP36]]
-; SLM-NEXT:    [[TMP39:%.*]] = add <4 x i32> [[TMP35]], [[TMP37]]
-; SLM-NEXT:    [[TMP40:%.*]] = mul nsw <4 x i32> <i32 -32000, i32 -32000, i32 -32000, i32 -32000>, [[TMP20]]
-; SLM-NEXT:    [[TMP41:%.*]] = mul nsw <4 x i32> <i32 -32000, i32 -32000, i32 -32000, i32 -32000>, [[TMP21]]
-; SLM-NEXT:    [[TMP42:%.*]] = add <4 x i32> [[TMP38]], [[TMP40]]
-; SLM-NEXT:    [[TMP43:%.*]] = add <4 x i32> [[TMP39]], [[TMP41]]
-; SLM-NEXT:    [[TMP44:%.*]] = mul nsw <4 x i32> <i32 250, i32 250, i32 250, i32 250>, [[TMP20]]
-; SLM-NEXT:    [[TMP45:%.*]] = mul nsw <4 x i32> <i32 250, i32 250, i32 250, i32 250>, [[TMP21]]
-; SLM-NEXT:    [[TMP46:%.*]] = add <4 x i32> [[TMP42]], [[TMP44]]
-; SLM-NEXT:    [[TMP47:%.*]] = add <4 x i32> [[TMP43]], [[TMP45]]
-; SLM-NEXT:    [[TMP48:%.*]] = add <4 x i32> [[VEC_PHI]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    [[TMP49:%.*]] = add <4 x i32> [[VEC_PHI1]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    [[TMP50]] = add <4 x i32> [[TMP48]], [[TMP46]]
-; SLM-NEXT:    [[TMP51]] = add <4 x i32> [[TMP49]], [[TMP47]]
-; SLM-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; SLM-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SLM-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; SLM:       middle.block:
-; SLM-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP51]], [[TMP50]]
-; SLM-NEXT:    [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; SLM-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; SLM-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; SLM:       scalar.ph:
-; SLM-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; SLM-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
-; SLM-NEXT:    br label [[FOR_BODY:%.*]]
-; SLM:       for.cond.cleanup.loopexit:
-; SLM-NEXT:    [[ADD4_LCSSA:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
-; SLM-NEXT:    [[PHITMP:%.*]] = trunc i32 [[ADD4_LCSSA]] to i16
-; SLM-NEXT:    br label [[FOR_COND_CLEANUP]]
-; SLM:       for.cond.cleanup:
-; SLM-NEXT:    [[ACC_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[PHITMP]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; SLM-NEXT:    ret i16 [[ACC_0_LCSSA]]
-; SLM:       for.body:
-; SLM-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; SLM-NEXT:    [[ACC_013:%.*]] = phi i32 [ [[ADD4]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; SLM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[DATAA]], i64 [[INDVARS_IV]]
-; SLM-NEXT:    [[TMP54:%.*]] = load i16, i16* [[ARRAYIDX]], align 1
-; SLM-NEXT:    [[CONV:%.*]] = sext i16 [[TMP54]] to i32
-; SLM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[DATAB]], i64 [[INDVARS_IV]]
-; SLM-NEXT:    [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX2]], align 1
-; SLM-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP55]] to i32
-; SLM-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV3]], [[CONV]]
-; SLM-NEXT:    [[CONV4:%.*]] = zext i16 [[TMP55]] to i32
-; SLM-NEXT:    [[MUL2:%.*]] = mul nsw i32 [[CONV4]], [[CONV]]
-; SLM-NEXT:    [[SUM0:%.*]] = add i32 [[MUL]], [[MUL2]]
-; SLM-NEXT:    [[CONV5:%.*]] = zext i16 [[TMP54]] to i32
-; SLM-NEXT:    [[MUL3:%.*]] = mul nsw i32 [[CONV5]], [[CONV4]]
-; SLM-NEXT:    [[SUM1:%.*]] = add i32 [[SUM0]], [[MUL3]]
-; SLM-NEXT:    [[MUL4:%.*]] = mul nsw i32 -32000, [[CONV3]]
-; SLM-NEXT:    [[SUM2:%.*]] = add i32 [[SUM1]], [[MUL4]]
-; SLM-NEXT:    [[MUL5:%.*]] = mul nsw i32 64000, [[CONV3]]
-; SLM-NEXT:    [[SUM3:%.*]] = add i32 [[SUM2]], [[MUL5]]
-; SLM-NEXT:    [[MUL6:%.*]] = mul nsw i32 -32000, [[CONV4]]
-; SLM-NEXT:    [[SUM4:%.*]] = add i32 [[SUM3]], [[MUL6]]
-; SLM-NEXT:    [[MUL7:%.*]] = mul nsw i32 250, [[CONV4]]
-; SLM-NEXT:    [[SUM5:%.*]] = add i32 [[SUM4]], [[MUL7]]
-; SLM-NEXT:    [[ADD:%.*]] = add i32 [[ACC_013]], 5
-; SLM-NEXT:    [[ADD4]] = add i32 [[ADD]], [[SUM5]]
-; SLM-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; SLM-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; SLM-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
 entry:
   %cmp12 = icmp eq i32 %N, 0
   br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
@@ -335,33 +99,40 @@ for.body:                                         ; preds = %for.body.preheader,
   %arrayidx2 = getelementptr inbounds i16, i16* %dataB, i64 %indvars.iv
   %1 = load i16, i16* %arrayidx2, align 1
   %conv3 = sext i16 %1 to i32
-; sources of the mul is sext\sext from i16
-; use pmulhw\pmullw\pshuf seq.
+; sources of the mul is sext\sext from i16 
+; use pmulhw\pmullw\pshuf seq.   
+; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32 %conv3, %conv
   %mul = mul nsw i32 %conv3, %conv
 ; sources of the mul is zext\sext from i16
 ; use pmulld
+; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32 %conv4, %conv
   %conv4 = zext i16 %1 to i32
   %mul2 = mul nsw i32 %conv4, %conv
   %sum0 = add i32 %mul, %mul2
 ; sources of the mul is zext\zext from i16
 ; use pmulhw\pmullw\zext
+; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32 %conv5, %conv4
   %conv5 = zext i16 %0 to i32
   %mul3 = mul nsw i32 %conv5, %conv4
   %sum1 = add i32 %sum0, %mul3
 ; sources of the mul is sext\-32000
 ; use pmulhw\pmullw\sext
+; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32 -32000, %conv3
   %mul4 = mul nsw i32 -32000, %conv3
   %sum2 = add i32 %sum1, %mul4
 ; sources of the mul is sext\64000
 ; use pmulld
+; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32 64000, %conv3
   %mul5 = mul nsw i32 64000, %conv3
   %sum3 = add i32 %sum2, %mul5
 ; sources of the mul is zext\-32000
 ; use pmulld
+; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32 -32000, %conv4
   %mul6 = mul nsw i32 -32000, %conv4
   %sum4 = add i32 %sum3, %mul6
 ; sources of the mul is zext\64000
 ; use pmulhw\pmullw\zext
+; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32 250, %conv4
   %mul7 = mul nsw i32 250, %conv4
   %sum5 = add i32 %sum4, %mul7
   %add = add i32 %acc.013, 5

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll
index 116fb5a2cb628..8618ed90d7f6c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
 
 ; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations
@@ -10,31 +9,6 @@ target triple = "x86_64-apple-macosx10.10.0"
 
 ; Function Attrs: nounwind readonly ssp uwtable
 define double @cond_sum(i32* nocapture readonly %v, i32 %n) #0 !dbg !4 {
-; CHECK-LABEL: @cond_sum(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP_7:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG8:![0-9]+]]
-; CHECK-NEXT:    br i1 [[CMP_7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG9:![0-9]+]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG10:![0-9]+]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi double [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]], !dbg [[DBG11:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[A_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; CHECK-NEXT:    ret double [[A_0_LCSSA]], !dbg [[DBG11]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[A_08:%.*]] = phi double [ [[ADD]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG10]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !dbg [[DBG10]], !tbaa [[TBAA12:![0-9]+]]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0, !dbg [[DBG16:![0-9]+]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP1]], double 3.400000e+00, double 1.150000e+00, !dbg [[DBG10]]
-; CHECK-NEXT:    [[ADD]] = fadd double [[A_08]], [[COND]], !dbg [[DBG17:![0-9]+]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG9]]
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG9]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG9]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !dbg [[DBG9]]
-;
 entry:
   %cmp.7 = icmp sgt i32 %n, 0, !dbg !3
   br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !8
@@ -66,71 +40,6 @@ for.body:                                         ; preds = %for.body.preheader,
 
 ; Function Attrs: nounwind readonly ssp uwtable
 define double @cond_sum_loop_hint(i32* nocapture readonly %v, i32 %n) #0 !dbg !20 {
-; CHECK-LABEL: @cond_sum_loop_hint(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP_7:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG19:![0-9]+]]
-; CHECK-NEXT:    br i1 [[CMP_7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG20:![0-9]+]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1, !dbg [[DBG21:![0-9]+]]
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64, !dbg [[DBG21]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1, !dbg [[DBG21]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4, !dbg [[DBG21]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !dbg [[DBG21]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4, !dbg [[DBG21]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]], !dbg [[DBG21]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]], !dbg [[DBG21]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], !dbg [[DBG20]]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ <double 0.000000e+00, double -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x double> [ <double -0.000000e+00, double -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP3]], !dbg [[DBG21]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP4]], !dbg [[DBG21]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0, !dbg [[DBG21]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*, !dbg [[DBG21]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP8]], align 4, !dbg [[DBG21]], !tbaa [[TBAA12]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 2, !dbg [[DBG21]]
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*, !dbg [[DBG21]]
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP10]], align 4, !dbg [[DBG21]], !tbaa [[TBAA12]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], zeroinitializer, !dbg [[DBG22:![0-9]+]]
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD2]], zeroinitializer, !dbg [[DBG22]]
-; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP11]], <2 x double> <double 3.400000e+00, double 3.400000e+00>, <2 x double> <double 1.150000e+00, double 1.150000e+00>, !dbg [[DBG21]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select <2 x i1> [[TMP12]], <2 x double> <double 3.400000e+00, double 3.400000e+00>, <2 x double> <double 1.150000e+00, double 1.150000e+00>, !dbg [[DBG21]]
-; CHECK-NEXT:    [[TMP15]] = fadd <2 x double> [[VEC_PHI]], [[TMP13]], !dbg [[DBG23:![0-9]+]]
-; CHECK-NEXT:    [[TMP16]] = fadd <2 x double> [[VEC_PHI1]], [[TMP14]], !dbg [[DBG23]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG20]]
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG20]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg [[DBG20]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd <2 x double> [[TMP16]], [[TMP15]], !dbg [[DBG20]]
-; CHECK-NEXT:    [[TMP18:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[BIN_RDX]]), !dbg [[DBG20]]
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]], !dbg [[DBG20]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]], !dbg [[DBG20]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG21]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi double [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]], !dbg [[DBG26:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[A_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT:    ret double [[A_0_LCSSA]], !dbg [[DBG26]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[A_08:%.*]] = phi double [ [[ADD]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[INDVARS_IV]], !dbg [[DBG21]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !dbg [[DBG21]], !tbaa [[TBAA12]]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP19]], 0, !dbg [[DBG22]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP1]], double 3.400000e+00, double 1.150000e+00, !dbg [[DBG21]]
-; CHECK-NEXT:    [[ADD]] = fadd double [[A_08]], [[COND]], !dbg [[DBG23]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG20]]
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG20]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG20]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !dbg [[DBG20]], !llvm.loop [[LOOP27:![0-9]+]]
-;
 entry:
   %cmp.7 = icmp sgt i32 %n, 0, !dbg !19
   br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !21
@@ -193,6 +102,6 @@ attributes #0 = { nounwind }
 !26 = distinct !{!26, !27}
 !27 = !{!"llvm.loop.vectorize.enable", i1 true}
 !28 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang",
-  file: !5,
-  isOptimized: true, flags: "-O2",
-  splitDebugFilename: "abc.debug", emissionKind: 2)
+                             file: !5,
+                             isOptimized: true, flags: "-O2",
+                             splitDebugFilename: "abc.debug", emissionKind: 2)

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
index d124b9780b0de..f593b9423e337 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 | FileCheck %s
 ; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 | FileCheck %s
 
@@ -11,31 +10,6 @@ target triple = "x86_64-apple-macosx10.10.0"
 
 ; Function Attrs: nounwind readonly ssp uwtable
 define double @cond_sum(i32* nocapture readonly %v, i32 %n) #0 !dbg !4 !prof !29 {
-; CHECK-LABEL: @cond_sum(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP_7:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG9:![0-9]+]]
-; CHECK-NEXT:    br i1 [[CMP_7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG10:![0-9]+]], !prof [[PROF11:![0-9]+]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG12:![0-9]+]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi double [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]], !dbg [[DBG13:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[A_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; CHECK-NEXT:    ret double [[A_0_LCSSA]], !dbg [[DBG13]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[A_08:%.*]] = phi double [ [[ADD]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG12]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !dbg [[DBG12]], !tbaa [[TBAA14:![0-9]+]]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0, !dbg [[DBG18:![0-9]+]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP1]], double 3.400000e+00, double 1.150000e+00, !dbg [[DBG12]]
-; CHECK-NEXT:    [[ADD]] = fadd double [[A_08]], [[COND]], !dbg [[DBG19:![0-9]+]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG10]]
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG10]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG10]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !dbg [[DBG10]], !prof [[PROF20:![0-9]+]], !llvm.loop [[LOOP21:![0-9]+]]
-;
 entry:
   %cmp.7 = icmp sgt i32 %n, 0, !dbg !3
   br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !8, !prof !30
@@ -67,61 +41,6 @@ for.body:                                         ; preds = %for.body.preheader,
 
 ; Function Attrs: nounwind readonly ssp uwtable
 define double @cond_sum_loop_hint(i32* nocapture readonly %v, i32 %n) #0 !dbg !20 !prof !29{
-; CHECK-LABEL: @cond_sum_loop_hint(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP_7:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG24:![0-9]+]]
-; CHECK-NEXT:    br i1 [[CMP_7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG25:![0-9]+]], !prof [[PROF11]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1, !dbg [[DBG26:![0-9]+]]
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64, !dbg [[DBG26]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1, !dbg [[DBG26]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2, !dbg [[DBG26]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !dbg [[DBG26]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2, !dbg [[DBG26]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]], !dbg [[DBG26]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]], !dbg [[DBG26]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], !dbg [[DBG25]]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ <double 0.000000e+00, double -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP3]], !dbg [[DBG26]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0, !dbg [[DBG26]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*, !dbg [[DBG26]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4, !dbg [[DBG26]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], zeroinitializer, !dbg [[DBG27:![0-9]+]]
-; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x double> <double 3.400000e+00, double 3.400000e+00>, <2 x double> <double 1.150000e+00, double 1.150000e+00>, !dbg [[DBG26]]
-; CHECK-NEXT:    [[TMP9]] = fadd <2 x double> [[VEC_PHI]], [[TMP8]], !dbg [[DBG28:![0-9]+]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2, !dbg [[DBG25]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG25]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg [[DBG25]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP11:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP9]]), !dbg [[DBG25]]
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]], !dbg [[DBG25]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]], !dbg [[DBG25]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG26]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi double [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]], !dbg [[DBG32:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[A_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT:    ret double [[A_0_LCSSA]], !dbg [[DBG32]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[A_08:%.*]] = phi double [ [[ADD]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[INDVARS_IV]], !dbg [[DBG26]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !dbg [[DBG26]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP12]], 0, !dbg [[DBG27]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP1]], double 3.400000e+00, double 1.150000e+00, !dbg [[DBG26]]
-; CHECK-NEXT:    [[ADD]] = fadd double [[A_08]], [[COND]], !dbg [[DBG28]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG25]]
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG25]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG25]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !dbg [[DBG25]], !prof [[PROF33:![0-9]+]], !llvm.loop [[LOOP34:![0-9]+]]
-;
 entry:
   %cmp.7 = icmp sgt i32 %n, 0, !dbg !19
   br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !21, !prof !30
@@ -186,9 +105,9 @@ attributes #0 = { nounwind }
 !26 = distinct !{!26, !27, !18}
 !27 = !{!"llvm.loop.vectorize.enable", i1 true}
 !28 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang",
-  file: !5,
-  isOptimized: true, flags: "-O2",
-  splitDebugFilename: "abc.debug", emissionKind: 2)
+                             file: !5,
+                             isOptimized: true, flags: "-O2",
+                             splitDebugFilename: "abc.debug", emissionKind: 2)
 !29 = !{!"function_entry_count", i64 3}
 !30 = !{!"branch_weights", i32 99, i32 1}
 !31 = !{!"branch_weights", i32 1, i32 99}

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
index d52bfc0a7cfd9..db066a2156b1e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -20,8 +20,8 @@ define i32 @foo_optsize() #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> poison, i32 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> poison, <64 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <64 x i32> [[VEC_IV]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
@@ -47,7 +47,7 @@ define i32 @foo_optsize() #0 {
 ; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret i32 0
 ;
@@ -61,8 +61,8 @@ define i32 @foo_optsize() #0 {
 ; AUTOVF-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; AUTOVF-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i32> poison, i32 [[INDEX]], i32 0
 ; AUTOVF-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> poison, <32 x i32> zeroinitializer
-; AUTOVF-NEXT:    [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AUTOVF-NEXT:    [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; AUTOVF-NEXT:    [[INDUCTION:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AUTOVF-NEXT:    [[TMP1:%.*]] = icmp ule <32 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
 ; AUTOVF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
 ; AUTOVF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
 ; AUTOVF-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
@@ -88,7 +88,7 @@ define i32 @foo_optsize() #0 {
 ; AUTOVF-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
 ; AUTOVF-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
 ; AUTOVF-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; AUTOVF-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; AUTOVF-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
 ; AUTOVF:       for.end:
 ; AUTOVF-NEXT:    ret i32 0
 ;
@@ -124,8 +124,8 @@ define i32 @foo_minsize() #1 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> poison, i32 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> poison, <64 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <64 x i32> [[VEC_IV]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
@@ -136,7 +136,7 @@ define i32 @foo_minsize() #1 {
 ; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP6]], <64 x i8>* [[TMP7]], i32 1, <64 x i1> [[TMP1]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -151,7 +151,7 @@ define i32 @foo_minsize() #1 {
 ; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret i32 0
 ;
@@ -165,8 +165,8 @@ define i32 @foo_minsize() #1 {
 ; AUTOVF-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; AUTOVF-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i32> poison, i32 [[INDEX]], i32 0
 ; AUTOVF-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> poison, <32 x i32> zeroinitializer
-; AUTOVF-NEXT:    [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AUTOVF-NEXT:    [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; AUTOVF-NEXT:    [[INDUCTION:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AUTOVF-NEXT:    [[TMP1:%.*]] = icmp ule <32 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
 ; AUTOVF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
 ; AUTOVF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
 ; AUTOVF-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
@@ -192,7 +192,7 @@ define i32 @foo_minsize() #1 {
 ; AUTOVF-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
 ; AUTOVF-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
 ; AUTOVF-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; AUTOVF-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; AUTOVF-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]]
 ; AUTOVF:       for.end:
 ; AUTOVF-NEXT:    ret i32 0
 ;
@@ -231,17 +231,17 @@ define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture read
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <64 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <64 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <64 x i32> [[TMP1]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <64 x i32> @llvm.masked.gather.v64i32.v64p0i32(<64 x i32*> [[TMP2]], i32 4, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <64 x i32> undef)
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <64 x i32>*
-; CHECK-NEXT:    store <64 x i32> [[WIDE_MASKED_GATHER]], <64 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP64:%.*]] = mul nsw <64 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <64 x i32> [[TMP64]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <64 x i32> @llvm.masked.gather.v64i32.v64p0i32(<64 x i32*> [[TMP65]], i32 4, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <64 x i32> undef)
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[TMP66]], i32 0
+; CHECK-NEXT:    [[TMP68:%.*]] = bitcast i32* [[TMP67]] to <64 x i32>*
+; CHECK-NEXT:    store <64 x i32> [[WIDE_MASKED_GATHER]], <64 x i32>* [[TMP68]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 64
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <64 x i32> [[VEC_IND]], <i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP69:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP69]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 256, 256
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -252,12 +252,12 @@ define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture read
 ; CHECK-NEXT:    [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[MUL]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_07]]
-; CHECK-NEXT:    store i32 [[TMP7]], i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    store i32 [[TMP70]], i32* [[ARRAYIDX1]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_07]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    ret void
 ;
@@ -272,17 +272,17 @@ define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture read
 ; AUTOVF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AUTOVF-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AUTOVF-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; AUTOVF-NEXT:    [[TMP1:%.*]] = mul nsw <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; AUTOVF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <8 x i32> [[TMP1]]
-; AUTOVF-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
-; AUTOVF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; AUTOVF-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; AUTOVF-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
-; AUTOVF-NEXT:    store <8 x i32> [[WIDE_MASKED_GATHER]], <8 x i32>* [[TMP5]], align 4
+; AUTOVF-NEXT:    [[TMP8:%.*]] = mul nsw <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; AUTOVF-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <8 x i32> [[TMP8]]
+; AUTOVF-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+; AUTOVF-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
+; AUTOVF-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
+; AUTOVF-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>*
+; AUTOVF-NEXT:    store <8 x i32> [[WIDE_MASKED_GATHER]], <8 x i32>* [[TMP12]], align 4
 ; AUTOVF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; AUTOVF-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; AUTOVF-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; AUTOVF-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; AUTOVF-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; AUTOVF-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; AUTOVF:       middle.block:
 ; AUTOVF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 256, 256
 ; AUTOVF-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -293,12 +293,12 @@ define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture read
 ; AUTOVF-NEXT:    [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; AUTOVF-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]]
 ; AUTOVF-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[MUL]]
-; AUTOVF-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AUTOVF-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
 ; AUTOVF-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_07]]
-; AUTOVF-NEXT:    store i32 [[TMP7]], i32* [[ARRAYIDX1]], align 4
+; AUTOVF-NEXT:    store i32 [[TMP14]], i32* [[ARRAYIDX1]], align 4
 ; AUTOVF-NEXT:    [[INC]] = add nuw nsw i32 [[I_07]], 1
 ; AUTOVF-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256
-; AUTOVF-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; AUTOVF-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
 ; AUTOVF:       for.end.loopexit:
 ; AUTOVF-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
index dd902157ab21e..0e6f06497e306 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -10,33 +9,6 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ;CHECK-NOT: <4 x i32>
 define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
-; CHECK-LABEL: @parallel_loop(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_REG2MEM:%.*]] = alloca i64, align 8
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_RELOAD2:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[STOREMERGE]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[STOREMERGE]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[IDXPROM3:%.*]] = sext i32 [[TMP1]] to i64
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IDXPROM3]]
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[ARRAYIDX4]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add i64 [[STOREMERGE]], 1
-; CHECK-NEXT:    store i64 [[INDVARS_IV_NEXT]], i64* [[INDVARS_IV_NEXT_REG2MEM]], align 8
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ARRAYIDX2]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 512
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], !llvm.loop [[LOOP1:![0-9]+]]
-; CHECK:       for.body.for.body_crit_edge:
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_RELOAD2]] = load i64, i64* [[INDVARS_IV_NEXT_REG2MEM]], align 8
-; CHECK-NEXT:    br label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   %indvars.iv.next.reg2mem = alloca i64
   %indvars.iv.reg2mem = alloca i64

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
index 0dcbb75acdd3c..31a70a36763b7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 
 ; RUN: opt -runtime-memory-check-threshold=9 -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S -debug %s 2>&1 | FileCheck %s
@@ -14,133 +13,8 @@ declare double @llvm.pow.f64(double, double)
 ; TODO: should not be vectorized.
 define void @test(double* nocapture %A, double* nocapture %B, double* nocapture %C, double* nocapture %D, double* nocapture %E) {
 ; CHECK-LABEL: @test(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
-; CHECK-NEXT:    [[B3:%.*]] = bitcast double* [[B:%.*]] to i8*
-; CHECK-NEXT:    [[E6:%.*]] = bitcast double* [[E:%.*]] to i8*
-; CHECK-NEXT:    [[C9:%.*]] = bitcast double* [[C:%.*]] to i8*
-; CHECK-NEXT:    [[D12:%.*]] = bitcast double* [[D:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 16
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr double, double* [[B]], i64 16
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[E]], i64 16
-; CHECK-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
-; CHECK-NEXT:    [[SCEVGEP10:%.*]] = getelementptr double, double* [[C]], i64 16
-; CHECK-NEXT:    [[SCEVGEP1011:%.*]] = bitcast double* [[SCEVGEP10]] to i8*
-; CHECK-NEXT:    [[SCEVGEP13:%.*]] = getelementptr double, double* [[D]], i64 16
-; CHECK-NEXT:    [[SCEVGEP1314:%.*]] = bitcast double* [[SCEVGEP13]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    [[BOUND015:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[BOUND116:%.*]] = icmp ult i8* [[E6]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]]
-; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT17]]
-; CHECK-NEXT:    [[BOUND018:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP1011]]
-; CHECK-NEXT:    [[BOUND119:%.*]] = icmp ult i8* [[C9]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT20:%.*]] = and i1 [[BOUND018]], [[BOUND119]]
-; CHECK-NEXT:    [[CONFLICT_RDX21:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT20]]
-; CHECK-NEXT:    [[BOUND022:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP1314]]
-; CHECK-NEXT:    [[BOUND123:%.*]] = icmp ult i8* [[D12]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT24:%.*]] = and i1 [[BOUND022]], [[BOUND123]]
-; CHECK-NEXT:    [[CONFLICT_RDX25:%.*]] = or i1 [[CONFLICT_RDX21]], [[FOUND_CONFLICT24]]
-; CHECK-NEXT:    [[BOUND026:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[BOUND127:%.*]] = icmp ult i8* [[E6]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[FOUND_CONFLICT28:%.*]] = and i1 [[BOUND026]], [[BOUND127]]
-; CHECK-NEXT:    [[CONFLICT_RDX29:%.*]] = or i1 [[CONFLICT_RDX25]], [[FOUND_CONFLICT28]]
-; CHECK-NEXT:    [[BOUND030:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP1011]]
-; CHECK-NEXT:    [[BOUND131:%.*]] = icmp ult i8* [[C9]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[FOUND_CONFLICT32:%.*]] = and i1 [[BOUND030]], [[BOUND131]]
-; CHECK-NEXT:    [[CONFLICT_RDX33:%.*]] = or i1 [[CONFLICT_RDX29]], [[FOUND_CONFLICT32]]
-; CHECK-NEXT:    [[BOUND034:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP1314]]
-; CHECK-NEXT:    [[BOUND135:%.*]] = icmp ult i8* [[D12]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[FOUND_CONFLICT36:%.*]] = and i1 [[BOUND034]], [[BOUND135]]
-; CHECK-NEXT:    [[CONFLICT_RDX37:%.*]] = or i1 [[CONFLICT_RDX33]], [[FOUND_CONFLICT36]]
-; CHECK-NEXT:    [[BOUND038:%.*]] = icmp ult i8* [[E6]], [[SCEVGEP1011]]
-; CHECK-NEXT:    [[BOUND139:%.*]] = icmp ult i8* [[C9]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[FOUND_CONFLICT40:%.*]] = and i1 [[BOUND038]], [[BOUND139]]
-; CHECK-NEXT:    [[CONFLICT_RDX41:%.*]] = or i1 [[CONFLICT_RDX37]], [[FOUND_CONFLICT40]]
-; CHECK-NEXT:    [[BOUND042:%.*]] = icmp ult i8* [[E6]], [[SCEVGEP1314]]
-; CHECK-NEXT:    [[BOUND143:%.*]] = icmp ult i8* [[D12]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[FOUND_CONFLICT44:%.*]] = and i1 [[BOUND042]], [[BOUND143]]
-; CHECK-NEXT:    [[CONFLICT_RDX45:%.*]] = or i1 [[CONFLICT_RDX41]], [[FOUND_CONFLICT44]]
-; CHECK-NEXT:    br i1 [[CONFLICT_RDX45]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> zeroinitializer, <2 x double>* [[TMP4]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> <double 2.000000e+00, double 2.000000e+00>)
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD46:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 4, !alias.scope !8, !noalias !9
-; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[WIDE_LOAD46]], <2 x double> [[TMP5]])
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> zeroinitializer, <2 x double>* [[TMP10]], align 4, !alias.scope !8, !noalias !9
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[C]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD47:%.*]] = load <2 x double>, <2 x double>* [[TMP13]], align 4, !alias.scope !10
-; CHECK-NEXT:    [[TMP14:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[TMP5]], <2 x double> [[WIDE_LOAD47]])
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, double* [[D]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP15]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast double* [[TMP16]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD48:%.*]] = load <2 x double>, <2 x double>* [[TMP17]], align 8, !alias.scope !11
-; CHECK-NEXT:    [[TMP18:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[TMP14]], <2 x double> [[WIDE_LOAD48]])
-; CHECK-NEXT:    [[TMP19:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[TMP18]], <2 x double> [[TMP14]])
-; CHECK-NEXT:    [[TMP20:%.*]] = fmul <2 x double> <double 2.000000e+00, double 2.000000e+00>, [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = fmul <2 x double> [[TMP20]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-NEXT:    [[TMP22:%.*]] = fmul <2 x double> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds double, double* [[E]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP23]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = bitcast double* [[TMP24]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP22]], <2 x double>* [[TMP25]], align 4, !alias.scope !12, !noalias !13
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[L_A:%.*]] = load double, double* [[GEP_A]], align 4
-; CHECK-NEXT:    store double 0.000000e+00, double* [[GEP_A]], align 4
-; CHECK-NEXT:    [[P_1:%.*]] = call double @llvm.pow.f64(double [[L_A]], double 2.000000e+00)
-; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[L_B:%.*]] = load double, double* [[GEP_B]], align 4
-; CHECK-NEXT:    [[P_2:%.*]] = call double @llvm.pow.f64(double [[L_B]], double [[P_1]])
-; CHECK-NEXT:    store double 0.000000e+00, double* [[GEP_B]], align 4
-; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds double, double* [[C]], i64 [[IV]]
-; CHECK-NEXT:    [[L_C:%.*]] = load double, double* [[GEP_C]], align 4
-; CHECK-NEXT:    [[P_3:%.*]] = call double @llvm.pow.f64(double [[P_1]], double [[L_C]])
-; CHECK-NEXT:    [[GEP_D:%.*]] = getelementptr inbounds double, double* [[D]], i64 [[IV]]
-; CHECK-NEXT:    [[L_D:%.*]] = load double, double* [[GEP_D]], align 8
-; CHECK-NEXT:    [[P_4:%.*]] = call double @llvm.pow.f64(double [[P_3]], double [[L_D]])
-; CHECK-NEXT:    [[P_5:%.*]] = call double @llvm.pow.f64(double [[P_4]], double [[P_3]])
-; CHECK-NEXT:    [[MUL:%.*]] = fmul double 2.000000e+00, [[P_5]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul double [[MUL]], 2.000000e+00
-; CHECK-NEXT:    [[MUL_3:%.*]] = fmul double [[MUL]], [[MUL_2]]
-; CHECK-NEXT:    [[GEP_E:%.*]] = getelementptr inbounds double, double* [[E]], i64 [[IV]]
-; CHECK-NEXT:    store double [[MUL_3]], double* [[GEP_E]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 16
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK: vector.memcheck
+; CHECK: vector.body
 ;
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
index 2dcbfd4f770f9..84df4aaf08af6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -31,7 +31,7 @@ define void @small_tc(float* noalias nocapture %A, float* noalias nocapture read
 ; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 8, 8
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -48,7 +48,7 @@ define void @small_tc(float* noalias nocapture %A, float* noalias nocapture read
 ; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP3:!llvm.loop !.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
index 5bed4b8959ed6..cf75d309d458f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
@@ -73,19 +73,19 @@ define i32 @main(i32* %ptr) {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 [[DOTPROMOTED]], [[INDEX]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP20]], 1
-; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP21]], 1
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP22]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[TMP23]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i32 0
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> zeroinitializer, <4 x i32>* [[TMP27]], align 4
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i32 4
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> zeroinitializer, <4 x i32>* [[TMP29]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP20]], 1
+; CHECK-NEXT:    [[TMP26:%.*]] = add i32 [[TMP21]], 1
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP25]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[TMP26]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[TMP29]], i32 0
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, <4 x i32>* [[TMP32]], align 4
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[TMP29]], i32 4
+; CHECK-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, <4 x i32>* [[TMP34]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP6]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]]

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll
index da3e69c8ce47b..587fca649a40f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll
@@ -1,68 +1,28 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
 
 define void @foo(i64* %ptr, i32* %ptr.2) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[PTR_21:%.*]] = bitcast i32* [[PTR_2:%.*]] to i8*
-; CHECK-NEXT:    [[PTR3:%.*]] = bitcast i64* [[PTR:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[PTR_2]], i64 1
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i64, i64* [[PTR]], i64 80
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i64* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[PTR_21]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[PTR3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 2, i64 3, i64 4, i64 5>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP0]], 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR_2]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR_2]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[PTR_2]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT:    store i32 [[TMP4]], i32* [[PTR_2]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64* [[TMP7]] to <4 x i64>*
-; CHECK-NEXT:    store <4 x i64> [[VEC_IND]], <4 x i64>* [[TMP8]], align 8, !alias.scope !3
+; CHECK-NEXT:    [[TRUNC:%.+]] = trunc i64 [[OFFSET_IDX]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TRUNC]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TRUNC]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TRUNC]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TRUNC]], 3
+; CHECK-NEXT:    = add i64 [[INDEX]], 0
+; CHECK-NEXT:    store i32 [[TMP7]], i32* %ptr.2, align 4
+; CHECK-NEXT:    store i32 [[TMP8]], i32* %ptr.2, align 4
+; CHECK-NEXT:    store i32 [[TMP9]], i32* %ptr.2, align 4
+; CHECK-NEXT:    store i32 [[TMP10]], i32* %ptr.2, align 4
+; CHECK:         store <4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 80
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 80, 80
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 80, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi i64 [ 82, [[MIDDLE_BLOCK]] ], [ 2, [[ENTRY]] ], [ 2, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[CAN_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[CAN_IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = and i64 [[TMP10]], 4294967295
-; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP10]] to i32
-; CHECK-NEXT:    store i32 [[TMP12]], i32* [[PTR_2]], align 4
-; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 [[CAN_IV]]
-; CHECK-NEXT:    store i64 [[TMP10]], i64* [[GEP_PTR]], align 8
-; CHECK-NEXT:    [[TMP13]] = add nuw nsw i64 [[TMP11]], 1
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], 80
-; CHECK-NEXT:    [[CAN_IV_NEXT]] = add nuw nsw i64 [[CAN_IV]], 1
-; CHECK-NEXT:    br i1 [[TMP14]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 80
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
 ;
 entry:
   br label %loop

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll b/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll
index afa5d39bc1e9a..38ca5f964a093 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -S < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
@@ -6,92 +5,9 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; Make sure that we can compile the test without crash.
 define void @barney() {
+
 ; CHECK-LABEL: @barney(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    br label [[BB2:%.*]]
-; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt i32 undef, 0
-; CHECK-NEXT:    br i1 [[TMP4]], label [[BB2]], label [[BB5:%.*]]
-; CHECK:       bb5:
-; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 undef, i64 1)
-; CHECK-NEXT:    br label [[BB19:%.*]]
-; CHECK:       bb18:
-; CHECK-NEXT:    ret void
-; CHECK:       bb19:
-; CHECK-NEXT:    [[TMP22:%.*]] = phi i32 [ [[TMP65_LCSSA:%.*]], [[BB36:%.*]] ], [ undef, [[BB5]] ]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 32
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 32
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[CAST_CRD]], 13
-; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[TMP22]], [[TMP0]]
-; CHECK-NEXT:    [[IND_END3:%.*]] = add i64 1, [[N_VEC]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[TMP22]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <16 x i32> [[DOTSPLAT]], <i32 0, i32 13, i32 26, i32 39, i32 52, i32 65, i32 78, i32 91, i32 104, i32 117, i32 130, i32 143, i32 156, i32 169, i32 182, i32 195>
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <16 x i32> [[VEC_IND]], <i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208>
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 16
-; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i32> [[VEC_IND]], <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <16 x i32> [[STEP_ADD]], <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13>
-; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i32> [[STEP_ADD]], <i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208>
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[CAST_CMO:%.*]] = trunc i64 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[CAST_CMO]], 13
-; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i32 [[TMP22]], [[TMP9]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[BB46:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP22]], [[BB19]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ 1, [[BB19]] ]
-; CHECK-NEXT:    br label [[BB50:%.*]]
-; CHECK:       bb33:
-; CHECK-NEXT:    [[TMP65_LCSSA]] = phi i32 [ [[TMP65:%.*]], [[BB62:%.*]] ]
-; CHECK-NEXT:    br i1 true, label [[BB18:%.*]], label [[BB36]]
-; CHECK:       bb36:
-; CHECK-NEXT:    br label [[BB19]]
-; CHECK:       bb46:
-; CHECK-NEXT:    [[TMP52_LCSSA:%.*]] = phi i32 [ [[TMP52:%.*]], [[BB50]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[TMP55_LCSSA:%.*]] = phi i32 [ [[TMP55:%.*]], [[BB50]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[TMP56_LCSSA:%.*]] = phi i64 [ [[TMP56:%.*]], [[BB50]] ], [ [[IND_END3]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br i1 true, label [[BB48:%.*]], label [[BB59:%.*]]
-; CHECK:       bb48:
-; CHECK-NEXT:    [[TMP52_LCSSA_LCSSA:%.*]] = phi i32 [ [[TMP52_LCSSA]], [[BB46]] ]
-; CHECK-NEXT:    [[TMP49:%.*]] = add i32 [[TMP52_LCSSA_LCSSA]], 14
-; CHECK-NEXT:    ret void
-; CHECK:       bb50:
-; CHECK-NEXT:    [[TMP52]] = phi i32 [ [[TMP55]], [[BB50]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP53:%.*]] = phi i64 [ [[TMP56]], [[BB50]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP54:%.*]] = add i32 [[TMP52]], 12
-; CHECK-NEXT:    [[TMP55]] = add i32 [[TMP52]], 13
-; CHECK-NEXT:    [[TMP56]] = add nuw nsw i64 [[TMP53]], 1
-; CHECK-NEXT:    [[TMP58:%.*]] = icmp ult i64 [[TMP53]], undef
-; CHECK-NEXT:    br i1 [[TMP58]], label [[BB50]], label [[BB46]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       bb59:
-; CHECK-NEXT:    br label [[BB62]]
-; CHECK:       bb62:
-; CHECK-NEXT:    [[TMP63:%.*]] = phi i32 [ [[TMP65]], [[BB68:%.*]] ], [ [[TMP55_LCSSA]], [[BB59]] ]
-; CHECK-NEXT:    [[TMP64:%.*]] = phi i64 [ [[TMP66:%.*]], [[BB68]] ], [ [[TMP56_LCSSA]], [[BB59]] ]
-; CHECK-NEXT:    [[TMP65]] = add i32 [[TMP63]], 13
-; CHECK-NEXT:    [[TMP66]] = add nuw nsw i64 [[TMP64]], 1
-; CHECK-NEXT:    [[TMP67:%.*]] = icmp ult i64 [[TMP66]], 2
-; CHECK-NEXT:    br i1 [[TMP67]], label [[BB68]], label [[BB33:%.*]]
-; CHECK:       bb68:
-; CHECK-NEXT:    br label [[BB62]]
-;
 
 bb:
   br label %bb2
@@ -149,60 +65,9 @@ bb68:                                             ; preds = %bb62
 }
 
 define i32 @foo(i32 addrspace(1)* %p) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[OUTER:%.*]]
-; CHECK:       outer:
-; CHECK-NEXT:    [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[OUTER_LATCH]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDVAR]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i32 1, [[N_VEC]]
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[N_VEC]], 2
-; CHECK-NEXT:    [[IND_END2:%.*]] = add i32 6, [[TMP1]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 6, i32 8, i32 10, i32 12>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP2]] = or <4 x i32> [[VEC_PHI]], [[VEC_IND]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+
+; CHECK-LABEL: foo
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[OUTER_LATCH]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[OUTER]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 6, [[OUTER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[OUTER]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[INNER:%.*]]
-; CHECK:       inner:
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[TMP7:%.*]], [[INNER]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[A:%.*]] = phi i32 [ [[TMP8:%.*]], [[INNER]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[B:%.*]] = phi i32 [ [[TMP6:%.*]], [[INNER]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP6]] = add i32 [[B]], 2
-; CHECK-NEXT:    [[TMP7]] = or i32 [[TMP5]], [[B]]
-; CHECK-NEXT:    [[TMP8]] = add nuw nsw i32 [[A]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[IV]], [[TMP9]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[INNER]], label [[OUTER_LATCH]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       outer_latch:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP7]], [[INNER]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    store atomic i32 [[DOTLCSSA]], i32 addrspace(1)* [[P:%.*]] unordered, align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[IV]], 63
-; CHECK-NEXT:    [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
-; CHECK-NEXT:    br i1 [[TMP11]], label [[EXIT:%.*]], label [[OUTER]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret i32 0
-;
 
 entry:
   br label %outer

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
index 52440ec5afdbf..7516c055ab732 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
@@ -11,9 +11,9 @@ define zeroext i8 @sum() {
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <64 x i8> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <64 x i8> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <64 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <64 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <64 x i8>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP1]], align 16
@@ -24,7 +24,7 @@ define zeroext i8 @sum() {
 ; CHECK-NEXT:    [[TMP5]] = add <64 x i8> [[WIDE_LOAD2]], [[VEC_PHI1]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX]], 0
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <64 x i8> [[TMP5]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
index 3af2649f5a854..f3cbb8728f63b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
@@ -37,17 +37,17 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; SSE2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; SSE2-NEXT:    [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
 ; SSE2-NEXT:    [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP9]], [[TMP5]]
-; SSE2-NEXT:    [[TMP11:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32>
-; SSE2-NEXT:    [[TMP12:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
-; SSE2-NEXT:    [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP12]], [[TMP11]]
-; SSE2-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], [[TMP10]]
-; SSE2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
-; SSE2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; SSE2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; SSE2-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP17]], align 4
+; SSE2-NEXT:    [[TMP13:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32>
+; SSE2-NEXT:    [[TMP15:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
+; SSE2-NEXT:    [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP13]]
+; SSE2-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[TMP16]], [[TMP10]]
+; SSE2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
+; SSE2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
+; SSE2-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <4 x i32>*
+; SSE2-NEXT:    store <4 x i32> [[TMP17]], <4 x i32>* [[TMP20]], align 4
 ; SSE2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SSE2-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SSE2-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SSE2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SSE2-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SSE2:       middle.block:
 ; SSE2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; SSE2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -56,21 +56,21 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; SSE2-NEXT:    br label [[FOR_BODY:%.*]]
 ; SSE2:       for.body:
 ; SSE2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SSE2-NEXT:    [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; SSE2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP19]]
-; SSE2-NEXT:    [[TMP20:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; SSE2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP20]] to i32
-; SSE2-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP19]]
-; SSE2-NEXT:    [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; SSE2-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP21]] to i32
+; SSE2-NEXT:    [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; SSE2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP22]]
+; SSE2-NEXT:    [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; SSE2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP23]] to i32
+; SSE2-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP22]]
+; SSE2-NEXT:    [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
+; SSE2-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP24]] to i32
 ; SSE2-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
-; SSE2-NEXT:    [[TMP22:%.*]] = or i64 [[TMP19]], 1
-; SSE2-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP22]]
-; SSE2-NEXT:    [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; SSE2-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP23]] to i32
-; SSE2-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP22]]
-; SSE2-NEXT:    [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
-; SSE2-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP24]] to i32
+; SSE2-NEXT:    [[TMP25:%.*]] = or i64 [[TMP22]], 1
+; SSE2-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP25]]
+; SSE2-NEXT:    [[TMP26:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; SSE2-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP26]] to i32
+; SSE2-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP25]]
+; SSE2-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
+; SSE2-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP27]] to i32
 ; SSE2-NEXT:    [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
 ; SSE2-NEXT:    [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
 ; SSE2-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]]
@@ -131,25 +131,25 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; SSE41-NEXT:    [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
 ; SSE41-NEXT:    [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP18]], [[TMP10]]
 ; SSE41-NEXT:    [[TMP21:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP11]]
-; SSE41-NEXT:    [[TMP22:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
-; SSE41-NEXT:    [[TMP23:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
-; SSE41-NEXT:    [[TMP24:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
-; SSE41-NEXT:    [[TMP25:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
-; SSE41-NEXT:    [[TMP26:%.*]] = mul nsw <4 x i32> [[TMP24]], [[TMP22]]
-; SSE41-NEXT:    [[TMP27:%.*]] = mul nsw <4 x i32> [[TMP25]], [[TMP23]]
-; SSE41-NEXT:    [[TMP28:%.*]] = add nsw <4 x i32> [[TMP26]], [[TMP20]]
-; SSE41-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[TMP27]], [[TMP21]]
-; SSE41-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
-; SSE41-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP1]]
-; SSE41-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TMP30]], i32 0
-; SSE41-NEXT:    [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <4 x i32>*
-; SSE41-NEXT:    store <4 x i32> [[TMP28]], <4 x i32>* [[TMP33]], align 4
-; SSE41-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP30]], i32 4
-; SSE41-NEXT:    [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <4 x i32>*
-; SSE41-NEXT:    store <4 x i32> [[TMP29]], <4 x i32>* [[TMP35]], align 4
+; SSE41-NEXT:    [[TMP26:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
+; SSE41-NEXT:    [[TMP27:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
+; SSE41-NEXT:    [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
+; SSE41-NEXT:    [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
+; SSE41-NEXT:    [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP26]]
+; SSE41-NEXT:    [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP27]]
+; SSE41-NEXT:    [[TMP34:%.*]] = add nsw <4 x i32> [[TMP32]], [[TMP20]]
+; SSE41-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[TMP33]], [[TMP21]]
+; SSE41-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
+; SSE41-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP1]]
+; SSE41-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0
+; SSE41-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; SSE41-NEXT:    store <4 x i32> [[TMP34]], <4 x i32>* [[TMP39]], align 4
+; SSE41-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 4
+; SSE41-NEXT:    [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <4 x i32>*
+; SSE41-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP41]], align 4
 ; SSE41-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; SSE41-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SSE41-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SSE41-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SSE41-NEXT:    br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SSE41:       middle.block:
 ; SSE41-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; SSE41-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -158,21 +158,21 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; SSE41-NEXT:    br label [[FOR_BODY:%.*]]
 ; SSE41:       for.body:
 ; SSE41-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SSE41-NEXT:    [[TMP37:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; SSE41-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP37]]
-; SSE41-NEXT:    [[TMP38:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; SSE41-NEXT:    [[CONV:%.*]] = sext i16 [[TMP38]] to i32
-; SSE41-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP37]]
-; SSE41-NEXT:    [[TMP39:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; SSE41-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP39]] to i32
+; SSE41-NEXT:    [[TMP43:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; SSE41-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP43]]
+; SSE41-NEXT:    [[TMP44:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; SSE41-NEXT:    [[CONV:%.*]] = sext i16 [[TMP44]] to i32
+; SSE41-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP43]]
+; SSE41-NEXT:    [[TMP45:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
+; SSE41-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP45]] to i32
 ; SSE41-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
-; SSE41-NEXT:    [[TMP40:%.*]] = or i64 [[TMP37]], 1
-; SSE41-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP40]]
-; SSE41-NEXT:    [[TMP41:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; SSE41-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP41]] to i32
-; SSE41-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP40]]
-; SSE41-NEXT:    [[TMP42:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
-; SSE41-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP42]] to i32
+; SSE41-NEXT:    [[TMP46:%.*]] = or i64 [[TMP43]], 1
+; SSE41-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP46]]
+; SSE41-NEXT:    [[TMP47:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; SSE41-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP47]] to i32
+; SSE41-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP46]]
+; SSE41-NEXT:    [[TMP48:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
+; SSE41-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP48]] to i32
 ; SSE41-NEXT:    [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
 ; SSE41-NEXT:    [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
 ; SSE41-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]]
@@ -267,41 +267,41 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; AVX1-NEXT:    [[TMP41:%.*]] = mul nsw <4 x i32> [[TMP37]], [[TMP21]]
 ; AVX1-NEXT:    [[TMP42:%.*]] = mul nsw <4 x i32> [[TMP38]], [[TMP22]]
 ; AVX1-NEXT:    [[TMP43:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP23]]
-; AVX1-NEXT:    [[TMP44:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
-; AVX1-NEXT:    [[TMP45:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
-; AVX1-NEXT:    [[TMP46:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
-; AVX1-NEXT:    [[TMP47:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
-; AVX1-NEXT:    [[TMP48:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32>
-; AVX1-NEXT:    [[TMP49:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32>
-; AVX1-NEXT:    [[TMP50:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
-; AVX1-NEXT:    [[TMP51:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
-; AVX1-NEXT:    [[TMP52:%.*]] = mul nsw <4 x i32> [[TMP48]], [[TMP44]]
-; AVX1-NEXT:    [[TMP53:%.*]] = mul nsw <4 x i32> [[TMP49]], [[TMP45]]
-; AVX1-NEXT:    [[TMP54:%.*]] = mul nsw <4 x i32> [[TMP50]], [[TMP46]]
-; AVX1-NEXT:    [[TMP55:%.*]] = mul nsw <4 x i32> [[TMP51]], [[TMP47]]
-; AVX1-NEXT:    [[TMP56:%.*]] = add nsw <4 x i32> [[TMP52]], [[TMP40]]
-; AVX1-NEXT:    [[TMP57:%.*]] = add nsw <4 x i32> [[TMP53]], [[TMP41]]
-; AVX1-NEXT:    [[TMP58:%.*]] = add nsw <4 x i32> [[TMP54]], [[TMP42]]
-; AVX1-NEXT:    [[TMP59:%.*]] = add nsw <4 x i32> [[TMP55]], [[TMP43]]
-; AVX1-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
-; AVX1-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP1]]
-; AVX1-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP3]]
-; AVX1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[TMP60]], i32 0
-; AVX1-NEXT:    [[TMP65:%.*]] = bitcast i32* [[TMP64]] to <4 x i32>*
-; AVX1-NEXT:    store <4 x i32> [[TMP56]], <4 x i32>* [[TMP65]], align 4
-; AVX1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[TMP60]], i32 4
-; AVX1-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; AVX1-NEXT:    store <4 x i32> [[TMP57]], <4 x i32>* [[TMP67]], align 4
-; AVX1-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP60]], i32 8
-; AVX1-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; AVX1-NEXT:    store <4 x i32> [[TMP58]], <4 x i32>* [[TMP69]], align 4
-; AVX1-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP60]], i32 12
-; AVX1-NEXT:    [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; AVX1-NEXT:    store <4 x i32> [[TMP59]], <4 x i32>* [[TMP71]], align 4
+; AVX1-NEXT:    [[TMP52:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
+; AVX1-NEXT:    [[TMP53:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
+; AVX1-NEXT:    [[TMP54:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
+; AVX1-NEXT:    [[TMP55:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
+; AVX1-NEXT:    [[TMP60:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32>
+; AVX1-NEXT:    [[TMP61:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32>
+; AVX1-NEXT:    [[TMP62:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
+; AVX1-NEXT:    [[TMP63:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
+; AVX1-NEXT:    [[TMP64:%.*]] = mul nsw <4 x i32> [[TMP60]], [[TMP52]]
+; AVX1-NEXT:    [[TMP65:%.*]] = mul nsw <4 x i32> [[TMP61]], [[TMP53]]
+; AVX1-NEXT:    [[TMP66:%.*]] = mul nsw <4 x i32> [[TMP62]], [[TMP54]]
+; AVX1-NEXT:    [[TMP67:%.*]] = mul nsw <4 x i32> [[TMP63]], [[TMP55]]
+; AVX1-NEXT:    [[TMP68:%.*]] = add nsw <4 x i32> [[TMP64]], [[TMP40]]
+; AVX1-NEXT:    [[TMP69:%.*]] = add nsw <4 x i32> [[TMP65]], [[TMP41]]
+; AVX1-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[TMP66]], [[TMP42]]
+; AVX1-NEXT:    [[TMP71:%.*]] = add nsw <4 x i32> [[TMP67]], [[TMP43]]
+; AVX1-NEXT:    [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
+; AVX1-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP1]]
+; AVX1-NEXT:    [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP3]]
+; AVX1-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 0
+; AVX1-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; AVX1-NEXT:    store <4 x i32> [[TMP68]], <4 x i32>* [[TMP77]], align 4
+; AVX1-NEXT:    [[TMP78:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 4
+; AVX1-NEXT:    [[TMP79:%.*]] = bitcast i32* [[TMP78]] to <4 x i32>*
+; AVX1-NEXT:    store <4 x i32> [[TMP69]], <4 x i32>* [[TMP79]], align 4
+; AVX1-NEXT:    [[TMP80:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 8
+; AVX1-NEXT:    [[TMP81:%.*]] = bitcast i32* [[TMP80]] to <4 x i32>*
+; AVX1-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP81]], align 4
+; AVX1-NEXT:    [[TMP82:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 12
+; AVX1-NEXT:    [[TMP83:%.*]] = bitcast i32* [[TMP82]] to <4 x i32>*
+; AVX1-NEXT:    store <4 x i32> [[TMP71]], <4 x i32>* [[TMP83]], align 4
 ; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; AVX1-NEXT:    [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT:    br i1 [[TMP72]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX1-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX1-NEXT:    br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX1:       middle.block:
 ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -310,21 +310,21 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX1:       for.body:
 ; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX1-NEXT:    [[TMP73:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP73]]
-; AVX1-NEXT:    [[TMP74:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; AVX1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP74]] to i32
-; AVX1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP73]]
-; AVX1-NEXT:    [[TMP75:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; AVX1-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP75]] to i32
+; AVX1-NEXT:    [[TMP85:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP85]]
+; AVX1-NEXT:    [[TMP86:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; AVX1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP86]] to i32
+; AVX1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP85]]
+; AVX1-NEXT:    [[TMP87:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
+; AVX1-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP87]] to i32
 ; AVX1-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
-; AVX1-NEXT:    [[TMP76:%.*]] = or i64 [[TMP73]], 1
-; AVX1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP76]]
-; AVX1-NEXT:    [[TMP77:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; AVX1-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP77]] to i32
-; AVX1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP76]]
-; AVX1-NEXT:    [[TMP78:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
-; AVX1-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP78]] to i32
+; AVX1-NEXT:    [[TMP88:%.*]] = or i64 [[TMP85]], 1
+; AVX1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP88]]
+; AVX1-NEXT:    [[TMP89:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; AVX1-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP89]] to i32
+; AVX1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP88]]
+; AVX1-NEXT:    [[TMP90:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
+; AVX1-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP90]] to i32
 ; AVX1-NEXT:    [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
 ; AVX1-NEXT:    [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
 ; AVX1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]]
@@ -368,17 +368,17 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; AVX2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; AVX2-NEXT:    [[TMP9:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i32>
 ; AVX2-NEXT:    [[TMP10:%.*]] = mul nsw <8 x i32> [[TMP9]], [[TMP5]]
-; AVX2-NEXT:    [[TMP11:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i32>
-; AVX2-NEXT:    [[TMP12:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i32>
-; AVX2-NEXT:    [[TMP13:%.*]] = mul nsw <8 x i32> [[TMP12]], [[TMP11]]
-; AVX2-NEXT:    [[TMP14:%.*]] = add nsw <8 x i32> [[TMP13]], [[TMP10]]
-; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
-; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; AVX2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>*
-; AVX2-NEXT:    store <8 x i32> [[TMP14]], <8 x i32>* [[TMP17]], align 4
+; AVX2-NEXT:    [[TMP13:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i32>
+; AVX2-NEXT:    [[TMP15:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i32>
+; AVX2-NEXT:    [[TMP16:%.*]] = mul nsw <8 x i32> [[TMP15]], [[TMP13]]
+; AVX2-NEXT:    [[TMP17:%.*]] = add nsw <8 x i32> [[TMP16]], [[TMP10]]
+; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
+; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
+; AVX2-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <8 x i32>*
+; AVX2-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* [[TMP20]], align 4
 ; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; AVX2-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX2-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX2-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX2:       middle.block:
 ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -387,21 +387,21 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX2:       for.body:
 ; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX2-NEXT:    [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP19]]
-; AVX2-NEXT:    [[TMP20:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; AVX2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP20]] to i32
-; AVX2-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP19]]
-; AVX2-NEXT:    [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; AVX2-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP21]] to i32
+; AVX2-NEXT:    [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP22]]
+; AVX2-NEXT:    [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; AVX2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP23]] to i32
+; AVX2-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP22]]
+; AVX2-NEXT:    [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
+; AVX2-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP24]] to i32
 ; AVX2-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
-; AVX2-NEXT:    [[TMP22:%.*]] = or i64 [[TMP19]], 1
-; AVX2-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP22]]
-; AVX2-NEXT:    [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; AVX2-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP23]] to i32
-; AVX2-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP22]]
-; AVX2-NEXT:    [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
-; AVX2-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP24]] to i32
+; AVX2-NEXT:    [[TMP25:%.*]] = or i64 [[TMP22]], 1
+; AVX2-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP25]]
+; AVX2-NEXT:    [[TMP26:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; AVX2-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP26]] to i32
+; AVX2-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP25]]
+; AVX2-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
+; AVX2-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP27]] to i32
 ; AVX2-NEXT:    [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
 ; AVX2-NEXT:    [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
 ; AVX2-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]]

diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
index c08c7a758bb22..4ad89a424bc0f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
@@ -9,55 +9,11 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define void @foo(i64* %p, i64* %p.last) unnamed_addr #0 {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P4:%.*]] = ptrtoint i64* [[P:%.*]] to i64
-; CHECK-NEXT:    [[P_LAST1:%.*]] = ptrtoint i64* [[P_LAST:%.*]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[P_LAST1]], -1024
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[P4]]
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 10
-; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 128
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP4]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i64* [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 0, i64 128, i64 256, i64 384>
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 512, i64 640, i64 768, i64 896>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 1024, i64 1152, i64 1280, i64 1408>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 1536, i64 1664, i64 1792, i64 1920>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64*> [[TMP5]] to <4 x %0**>
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i64*> [[TMP6]] to <4 x %0**>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i64*> [[TMP7]] to <4 x %0**>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64*> [[TMP8]] to <4 x %0**>
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP9]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP10]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP11]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP12]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[PTR_IND]] = getelementptr i64, i64* [[POINTER_PHI]], i64 2048
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[P2:%.*]] = phi i64* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P_INC:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[P_INC]] = getelementptr inbounds i64, i64* [[P2]], i64 128
-; CHECK-NEXT:    [[P3:%.*]] = bitcast i64* [[P2]] to %0**
-; CHECK-NEXT:    [[V:%.*]] = load %0*, %0** [[P3]], align 8
-; CHECK-NEXT:    [[B:%.*]] = icmp eq i64* [[P_INC]], [[P_LAST]]
-; CHECK-NEXT:    br i1 [[B]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK: vector.body:
+; CHECK:         [[WIDE_MASKED_GATHER:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
 ;
 entry:
   br label %loop
@@ -76,55 +32,11 @@ exit:
 
 define void @bar(i64* %p, i64* %p.last) unnamed_addr #0 {
 ; CHECK-LABEL: @bar(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P4:%.*]] = ptrtoint i64* [[P:%.*]] to i64
-; CHECK-NEXT:    [[P_LAST1:%.*]] = ptrtoint i64* [[P_LAST:%.*]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[P_LAST1]], -1024
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[P4]]
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 10
-; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 128
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP4]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i64* [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 0, i64 128, i64 256, i64 384>
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 512, i64 640, i64 768, i64 896>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 1024, i64 1152, i64 1280, i64 1408>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 1536, i64 1664, i64 1792, i64 1920>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64*> [[TMP5]] to <4 x %1**>
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i64*> [[TMP6]] to <4 x %1**>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i64*> [[TMP7]] to <4 x %1**>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64*> [[TMP8]] to <4 x %1**>
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP9]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP10]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP11]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP12]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[PTR_IND]] = getelementptr i64, i64* [[POINTER_PHI]], i64 2048
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[P2:%.*]] = phi i64* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P_INC:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[P_INC]] = getelementptr inbounds i64, i64* [[P2]], i64 128
-; CHECK-NEXT:    [[P3:%.*]] = bitcast i64* [[P2]] to %1**
-; CHECK-NEXT:    [[V:%.*]] = load %1*, %1** [[P3]], align 8
-; CHECK-NEXT:    [[B:%.*]] = icmp eq i64* [[P_INC]], [[P_LAST]]
-; CHECK-NEXT:    br i1 [[B]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK: vector.body:
+; CHECK:         [[WIDE_MASKED_GATHER:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
 ;
 entry:
   br label %loop

diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll
index 783b2dc23c1fd..4d5ca19a71168 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -mcpu=prescott -disable-basic-aa < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
@@ -7,78 +6,9 @@ target triple = "i386-apple-darwin"
 ; PR15344
 define void @test1(float* nocapture %arg, i32 %arg1) nounwind {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[ARG2:%.*]] = bitcast float* [[ARG:%.*]] to i8*
-; CHECK-NEXT:    br label [[BB2:%.*]]
-; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP:%.*]] = load double, double* null, align 8
-; CHECK-NEXT:    br i1 undef, label [[BB3_PREHEADER:%.*]], label [[BB12:%.*]]
-; CHECK:       bb3.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[ARG1:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[ARG]], i32 [[ARG1]]
-; CHECK-NEXT:    [[SCEVGEP3:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[ARG1]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[ARG1]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> zeroinitializer, double [[TMP]], i32 0
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [16 x double], [16 x double]* undef, i32 0, i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [16 x double], [16 x double]* undef, i32 0, i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[TMP5]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4, !alias.scope !0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 4, !alias.scope !0
-; CHECK-NEXT:    [[TMP9]] = fadd fast <2 x double> [[VEC_PHI]], undef
-; CHECK-NEXT:    [[TMP10]] = fadd fast <2 x double> [[VEC_PHI4]], undef
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[ARG]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[ARG]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast float* [[TMP13]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> undef, <2 x float>* [[TMP14]], align 4, !alias.scope !3, !noalias !0
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP15]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> undef, <2 x float>* [[TMP16]], align 4, !alias.scope !3, !noalias !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP9]]
-; CHECK-NEXT:    [[TMP18:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[ARG1]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[BB12_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[BB3_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP]], [[VECTOR_MEMCHECK]] ], [ [[TMP]], [[BB3_PREHEADER]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[BB3:%.*]]
-; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP4:%.*]] = phi double [ [[TMP9:%.*]], [[BB3]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[TMP8:%.*]], [[BB3]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [16 x double], [16 x double]* undef, i32 0, i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load double, double* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP8]] = add nsw i32 [[TMP5]], 1
-; CHECK-NEXT:    [[TMP9]] = fadd fast double [[TMP4]], undef
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[ARG]], i32 [[TMP5]]
-; CHECK-NEXT:    store float undef, float* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP8]], [[ARG1]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[BB12_LOOPEXIT]], label [[BB3]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       bb12.loopexit:
-; CHECK-NEXT:    [[TMP9_LCSSA:%.*]] = phi double [ [[TMP9]], [[BB3]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[BB12]]
-; CHECK:       bb12:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi double [ [[TMP]], [[BB2]] ], [ [[TMP9_LCSSA]], [[BB12_LOOPEXIT]] ]
-; CHECK-NEXT:    ret void
-;
+; CHECK: preheader
+; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0
+; CHECK: vector.memcheck
 
 bb:
   br label %bb2

diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
index c2fa294b79db4..62b231a0d80e6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
@@ -73,7 +73,7 @@ define float @reduction_sum_float_fastmath(i32 %n, float* %array) {
 ; CHECK-NEXT:    [[TMP9]] = fadd fast <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
@@ -91,7 +91,7 @@ define float @reduction_sum_float_fastmath(i32 %n, float* %array) {
 ; CHECK-NEXT:    [[SUM_INC]] = fadd fast float [[SUM]], [[VALUE]]
 ; CHECK-NEXT:    [[IDX_INC]] = add i32 [[IDX]], 1
 ; CHECK-NEXT:    [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
-; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       loop.exit.loopexit:
 ; CHECK-NEXT:    [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP_EXIT]]
@@ -145,7 +145,7 @@ define float @reduction_sum_float_only_reassoc(i32 %n, float* %array) {
 ; CHECK-NEXT:    [[TMP9]] = fadd reassoc <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
@@ -163,7 +163,7 @@ define float @reduction_sum_float_only_reassoc(i32 %n, float* %array) {
 ; CHECK-NEXT:    [[SUM_INC]] = fadd reassoc float [[SUM]], [[VALUE]]
 ; CHECK-NEXT:    [[IDX_INC]] = add i32 [[IDX]], 1
 ; CHECK-NEXT:    [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
-; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP5:!llvm.loop !.*]]
 ; CHECK:       loop.exit.loopexit:
 ; CHECK-NEXT:    [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP_EXIT]]
@@ -217,7 +217,7 @@ define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, float* %arra
 ; CHECK-NEXT:    [[TMP9]] = fadd reassoc contract <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
@@ -235,7 +235,7 @@ define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, float* %arra
 ; CHECK-NEXT:    [[SUM_INC]] = fadd reassoc contract float [[SUM]], [[VALUE]]
 ; CHECK-NEXT:    [[IDX_INC]] = add i32 [[IDX]], 1
 ; CHECK-NEXT:    [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
-; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP7:!llvm.loop !.*]]
 ; CHECK:       loop.exit.loopexit:
 ; CHECK-NEXT:    [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP_EXIT]]
@@ -298,7 +298,7 @@ define float @PR35538(float* nocapture readonly %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[TMP11]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf nsz ogt <4 x float> [[TMP10]], [[TMP11]]
 ; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf nsz <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]]
@@ -324,7 +324,7 @@ define float @PR35538(float* nocapture readonly %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[MAX_0_]] = select i1 [[CMP1_INV]], float [[TMP14]], float [[MAX_013]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
 ;
 entry:
   %cmp12 = icmp sgt i32 %N, 0
@@ -385,7 +385,7 @@ define float @PR35538_more_FMF(float* nocapture readonly %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[TMP11]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf ogt <4 x float> [[TMP10]], [[TMP11]]
 ; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]]
@@ -411,7 +411,7 @@ define float @PR35538_more_FMF(float* nocapture readonly %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[MAX_0_]] = select nnan ninf i1 [[CMP1_INV]], float [[TMP14]], float [[MAX_013]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]]
 ;
 entry:
   %cmp12 = icmp sgt i32 %N, 0

diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
index e86bd95a34e27..7c29faa51e64a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt < %s -loop-vectorize -mcpu=core-axv2 -force-vector-interleave=1 -dce -instcombine -debug-only=loop-vectorize -S < %s 2>&1  | FileCheck %s
 
@@ -46,66 +45,6 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   br
 ;
 define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
-; CHECK-LABEL: @reduction_i8(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP_12:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP_12]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i8> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8]] = add <4 x i8> [[TMP7]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP8]])
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP10]] to i32
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.for.cond.cleanup_crit_edge:
-; CHECK-NEXT:    [[ADD5_LCSSA:%.*]] = phi i32 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[CONV6:%.*]] = trunc i32 [[ADD5_LCSSA]] to i8
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i8 [ [[CONV6]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    ret i8 [[SUM_0_LCSSA]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SUM_013:%.*]] = phi i32 [ [[ADD5]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP12]] to i32
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP13]] to i32
-; CHECK-NEXT:    [[CONV4:%.*]] = and i32 [[SUM_013]], 255
-; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[CONV4]], [[CONV]]
-; CHECK-NEXT:    [[ADD5]] = add nuw nsw i32 [[ADD]], [[CONV3]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
 entry:
   %cmp.12 = icmp sgt i32 %n, 0
   br i1 %cmp.12, label %for.body.preheader, label %for.cond.cleanup

diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
index ae7d152cee663..b9598cf87c506 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
 ; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -mtriple=x86_64-unknown-linux -mattr=+avx512f -S 2>&1 | FileCheck %s --check-prefix=AVX512F
 ; REQUIRES: asserts
@@ -10,212 +9,17 @@ define i32 @foo() {
 ; This function has a loop of SAD pattern. Here we check when VF = 16 the
 ; register usage doesn't exceed 16.
 ;
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:  iter.check:
-; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = sub nsw <16 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp sgt <16 x i32> [[TMP9]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP9]], <16 x i32> [[TMP11]]
-; CHECK-NEXT:    [[TMP13]] = add <16 x i32> [[TMP12]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP15]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi <8 x i32> [ [[TMP16]], [[VEC_EPILOG_PH]] ], [ [[TMP30:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TMP18]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8* [[TMP19]] to <8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i8>, <8 x i8>* [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP21:%.*]] = zext <8 x i8> [[WIDE_LOAD6]] to <8 x i32>
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i8>, <8 x i8>* [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP25:%.*]] = zext <8 x i8> [[WIDE_LOAD7]] to <8 x i32>
-; CHECK-NEXT:    [[TMP26:%.*]] = sub nsw <8 x i32> [[TMP21]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp sgt <8 x i32> [[TMP26]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP28:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP26]]
-; CHECK-NEXT:    [[TMP29:%.*]] = select <8 x i1> [[TMP27]], <8 x i32> [[TMP26]], <8 x i32> [[TMP28]]
-; CHECK-NEXT:    [[TMP30]] = add <8 x i32> [[TMP29]], [[VEC_PHI5]]
-; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT8]], 1024
-; CHECK-NEXT:    br i1 [[TMP31]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP30]])
-; CHECK-NEXT:    [[CMP_N3:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N3]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP15]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP32]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA2:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP32]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[ADD_LCSSA2]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT:    ret i32 [[ADD_LCSSA]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[S_015:%.*]] = phi i32 [ [[BC_MERGE_RDX9]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP33]] to i32
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP34]] to i32
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV3]]
-; CHECK-NEXT:    [[ISPOS:%.*]] = icmp sgt i32 [[SUB]], -1
-; CHECK-NEXT:    [[NEG:%.*]] = sub nsw i32 0, [[SUB]]
-; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[ISPOS]], i32 [[SUB]], i32 [[NEG]]
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP35]], [[S_015]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-;
-; AVX512F-LABEL: @foo(
-; AVX512F-NEXT:  iter.check:
-; AVX512F-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; AVX512F:       vector.main.loop.iter.check:
-; AVX512F-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; AVX512F:       vector.ph:
-; AVX512F-NEXT:    br label [[VECTOR_BODY:%.*]]
-; AVX512F:       vector.body:
-; AVX512F-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[VEC_PHI:%.*]] = phi <64 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[VEC_PHI1:%.*]] = phi <64 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; AVX512F-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 64
-; AVX512F-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP0]]
-; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP1]]
-; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; AVX512F-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <64 x i8>*
-; AVX512F-NEXT:    [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP5]], align 1
-; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 64
-; AVX512F-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <64 x i8>*
-; AVX512F-NEXT:    [[WIDE_LOAD2:%.*]] = load <64 x i8>, <64 x i8>* [[TMP7]], align 1
-; AVX512F-NEXT:    [[TMP8:%.*]] = zext <64 x i8> [[WIDE_LOAD]] to <64 x i32>
-; AVX512F-NEXT:    [[TMP9:%.*]] = zext <64 x i8> [[WIDE_LOAD2]] to <64 x i32>
-; AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP0]]
-; AVX512F-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP1]]
-; AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 0
-; AVX512F-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <64 x i8>*
-; AVX512F-NEXT:    [[WIDE_LOAD3:%.*]] = load <64 x i8>, <64 x i8>* [[TMP13]], align 1
-; AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 64
-; AVX512F-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <64 x i8>*
-; AVX512F-NEXT:    [[WIDE_LOAD4:%.*]] = load <64 x i8>, <64 x i8>* [[TMP15]], align 1
-; AVX512F-NEXT:    [[TMP16:%.*]] = zext <64 x i8> [[WIDE_LOAD3]] to <64 x i32>
-; AVX512F-NEXT:    [[TMP17:%.*]] = zext <64 x i8> [[WIDE_LOAD4]] to <64 x i32>
-; AVX512F-NEXT:    [[TMP18:%.*]] = sub nsw <64 x i32> [[TMP8]], [[TMP16]]
-; AVX512F-NEXT:    [[TMP19:%.*]] = sub nsw <64 x i32> [[TMP9]], [[TMP17]]
-; AVX512F-NEXT:    [[TMP20:%.*]] = icmp sgt <64 x i32> [[TMP18]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; AVX512F-NEXT:    [[TMP21:%.*]] = icmp sgt <64 x i32> [[TMP19]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; AVX512F-NEXT:    [[TMP22:%.*]] = sub nsw <64 x i32> zeroinitializer, [[TMP18]]
-; AVX512F-NEXT:    [[TMP23:%.*]] = sub nsw <64 x i32> zeroinitializer, [[TMP19]]
-; AVX512F-NEXT:    [[TMP24:%.*]] = select <64 x i1> [[TMP20]], <64 x i32> [[TMP18]], <64 x i32> [[TMP22]]
-; AVX512F-NEXT:    [[TMP25:%.*]] = select <64 x i1> [[TMP21]], <64 x i32> [[TMP19]], <64 x i32> [[TMP23]]
-; AVX512F-NEXT:    [[TMP26]] = add <64 x i32> [[TMP24]], [[VEC_PHI]]
-; AVX512F-NEXT:    [[TMP27]] = add <64 x i32> [[TMP25]], [[VEC_PHI1]]
-; AVX512F-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
-; AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; AVX512F-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; AVX512F:       middle.block:
-; AVX512F-NEXT:    [[BIN_RDX:%.*]] = add <64 x i32> [[TMP27]], [[TMP26]]
-; AVX512F-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[BIN_RDX]])
-; AVX512F-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; AVX512F-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; AVX512F:       vec.epilog.iter.check:
-; AVX512F-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; AVX512F:       vec.epilog.ph:
-; AVX512F-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP29]], [[VEC_EPILOG_ITER_CHECK]] ]
-; AVX512F-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; AVX512F-NEXT:    [[TMP30:%.*]] = insertelement <32 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; AVX512F-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; AVX512F:       vec.epilog.vector.body:
-; AVX512F-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[VEC_PHI8:%.*]] = phi <32 x i32> [ [[TMP30]], [[VEC_EPILOG_PH]] ], [ [[TMP44:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[TMP31:%.*]] = add i64 [[OFFSET_IDX]], 0
-; AVX512F-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP31]]
-; AVX512F-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, i8* [[TMP32]], i32 0
-; AVX512F-NEXT:    [[TMP34:%.*]] = bitcast i8* [[TMP33]] to <32 x i8>*
-; AVX512F-NEXT:    [[WIDE_LOAD9:%.*]] = load <32 x i8>, <32 x i8>* [[TMP34]], align 1
-; AVX512F-NEXT:    [[TMP35:%.*]] = zext <32 x i8> [[WIDE_LOAD9]] to <32 x i32>
-; AVX512F-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP31]]
-; AVX512F-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i8, i8* [[TMP36]], i32 0
-; AVX512F-NEXT:    [[TMP38:%.*]] = bitcast i8* [[TMP37]] to <32 x i8>*
-; AVX512F-NEXT:    [[WIDE_LOAD10:%.*]] = load <32 x i8>, <32 x i8>* [[TMP38]], align 1
-; AVX512F-NEXT:    [[TMP39:%.*]] = zext <32 x i8> [[WIDE_LOAD10]] to <32 x i32>
-; AVX512F-NEXT:    [[TMP40:%.*]] = sub nsw <32 x i32> [[TMP35]], [[TMP39]]
-; AVX512F-NEXT:    [[TMP41:%.*]] = icmp sgt <32 x i32> [[TMP40]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; AVX512F-NEXT:    [[TMP42:%.*]] = sub nsw <32 x i32> zeroinitializer, [[TMP40]]
-; AVX512F-NEXT:    [[TMP43:%.*]] = select <32 x i1> [[TMP41]], <32 x i32> [[TMP40]], <32 x i32> [[TMP42]]
-; AVX512F-NEXT:    [[TMP44]] = add <32 x i32> [[TMP43]], [[VEC_PHI8]]
-; AVX512F-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 32
-; AVX512F-NEXT:    [[TMP45:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 1024
-; AVX512F-NEXT:    br i1 [[TMP45]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; AVX512F:       vec.epilog.middle.block:
-; AVX512F-NEXT:    [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP44]])
-; AVX512F-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 1024, 1024
-; AVX512F-NEXT:    br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; AVX512F:       vec.epilog.scalar.ph:
-; AVX512F-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; AVX512F-NEXT:    [[BC_MERGE_RDX12:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP29]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP46]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AVX512F-NEXT:    br label [[FOR_BODY:%.*]]
-; AVX512F:       for.cond.cleanup.loopexit:
-; AVX512F-NEXT:    [[ADD_LCSSA5:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP46]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AVX512F-NEXT:    br label [[FOR_COND_CLEANUP]]
-; AVX512F:       for.cond.cleanup:
-; AVX512F-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP29]], [[MIDDLE_BLOCK]] ], [ [[ADD_LCSSA5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; AVX512F-NEXT:    ret i32 [[ADD_LCSSA]]
-; AVX512F:       for.body:
-; AVX512F-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX512F-NEXT:    [[S_015:%.*]] = phi i32 [ [[BC_MERGE_RDX12]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; AVX512F-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[INDVARS_IV]]
-; AVX512F-NEXT:    [[TMP47:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX512F-NEXT:    [[CONV:%.*]] = zext i8 [[TMP47]] to i32
-; AVX512F-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[INDVARS_IV]]
-; AVX512F-NEXT:    [[TMP48:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; AVX512F-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP48]] to i32
-; AVX512F-NEXT:    [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV3]]
-; AVX512F-NEXT:    [[ISPOS:%.*]] = icmp sgt i32 [[SUB]], -1
-; AVX512F-NEXT:    [[NEG:%.*]] = sub nsw i32 0, [[SUB]]
-; AVX512F-NEXT:    [[TMP49:%.*]] = select i1 [[ISPOS]], i32 [[SUB]], i32 [[NEG]]
-; AVX512F-NEXT:    [[ADD]] = add nsw i32 [[TMP49]], [[S_015]]
-; AVX512F-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; AVX512F-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; AVX512F-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-;
+; CHECK-LABEL: foo
+; CHECK:      LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 7 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
+; CHECK:      LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 13 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
 
 entry:
   br label %for.body
@@ -247,226 +51,17 @@ define i32 @goo() {
 ; For indvars.iv used in a computing chain only feeding into getelementptr or cmp,
 ; it will not have a vector version and the vector register usage will not exceed the
 ; available vector register number.
-; CHECK-LABEL: @goo(
-; CHECK-NEXT:  iter.check:
-; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[TMP0]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP9]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = sub nsw <16 x i32> [[TMP5]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp sgt <16 x i32> [[TMP11]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP11]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP11]], <16 x i32> [[TMP13]]
-; CHECK-NEXT:    [[TMP15]] = add <16 x i32> [[TMP14]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi <8 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP34:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP20:%.*]] = add nsw i64 [[TMP19]], 3
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[TMP21]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i8* [[TMP22]] to <8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i8>, <8 x i8>* [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP24:%.*]] = zext <8 x i8> [[WIDE_LOAD6]] to <8 x i32>
-; CHECK-NEXT:    [[TMP25:%.*]] = add nsw i64 [[TMP19]], 2
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, i8* [[TMP26]], i32 0
-; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i8* [[TMP27]] to <8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i8>, <8 x i8>* [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP29:%.*]] = zext <8 x i8> [[WIDE_LOAD7]] to <8 x i32>
-; CHECK-NEXT:    [[TMP30:%.*]] = sub nsw <8 x i32> [[TMP24]], [[TMP29]]
-; CHECK-NEXT:    [[TMP31:%.*]] = icmp sgt <8 x i32> [[TMP30]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP32:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP30]]
-; CHECK-NEXT:    [[TMP33:%.*]] = select <8 x i1> [[TMP31]], <8 x i32> [[TMP30]], <8 x i32> [[TMP32]]
-; CHECK-NEXT:    [[TMP34]] = add <8 x i32> [[TMP33]], [[VEC_PHI5]]
-; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT8]], 1024
-; CHECK-NEXT:    br i1 [[TMP35]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]])
-; CHECK-NEXT:    [[CMP_N3:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N3]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP36]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA2:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP36]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[ADD_LCSSA2]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT:    ret i32 [[ADD_LCSSA]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[S_015:%.*]] = phi i32 [ [[BC_MERGE_RDX9]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP3]] to i32
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV3]]
-; CHECK-NEXT:    [[ISPOS:%.*]] = icmp sgt i32 [[SUB]], -1
-; CHECK-NEXT:    [[NEG:%.*]] = sub nsw i32 0, [[SUB]]
-; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[ISPOS]], i32 [[SUB]], i32 [[NEG]]
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP4]], [[S_015]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-; AVX512F-LABEL: @goo(
-; AVX512F-NEXT:  iter.check:
-; AVX512F-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; AVX512F:       vector.main.loop.iter.check:
-; AVX512F-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; AVX512F:       vector.ph:
-; AVX512F-NEXT:    br label [[VECTOR_BODY:%.*]]
-; AVX512F:       vector.body:
-; AVX512F-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[VEC_PHI:%.*]] = phi <64 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[VEC_PHI1:%.*]] = phi <64 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; AVX512F-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 64
-; AVX512F-NEXT:    [[TMP2:%.*]] = add nsw i64 [[TMP0]], 3
-; AVX512F-NEXT:    [[TMP3:%.*]] = add nsw i64 [[TMP1]], 3
-; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP2]]
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP3]]
-; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; AVX512F-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <64 x i8>*
-; AVX512F-NEXT:    [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP7]], align 1
-; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 64
-; AVX512F-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <64 x i8>*
-; AVX512F-NEXT:    [[WIDE_LOAD2:%.*]] = load <64 x i8>, <64 x i8>* [[TMP9]], align 1
-; AVX512F-NEXT:    [[TMP10:%.*]] = zext <64 x i8> [[WIDE_LOAD]] to <64 x i32>
-; AVX512F-NEXT:    [[TMP11:%.*]] = zext <64 x i8> [[WIDE_LOAD2]] to <64 x i32>
-; AVX512F-NEXT:    [[TMP12:%.*]] = add nsw i64 [[TMP0]], 2
-; AVX512F-NEXT:    [[TMP13:%.*]] = add nsw i64 [[TMP1]], 2
-; AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP12]]
-; AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP13]]
-; AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i32 0
-; AVX512F-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <64 x i8>*
-; AVX512F-NEXT:    [[WIDE_LOAD3:%.*]] = load <64 x i8>, <64 x i8>* [[TMP17]], align 1
-; AVX512F-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i32 64
-; AVX512F-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <64 x i8>*
-; AVX512F-NEXT:    [[WIDE_LOAD4:%.*]] = load <64 x i8>, <64 x i8>* [[TMP19]], align 1
-; AVX512F-NEXT:    [[TMP20:%.*]] = zext <64 x i8> [[WIDE_LOAD3]] to <64 x i32>
-; AVX512F-NEXT:    [[TMP21:%.*]] = zext <64 x i8> [[WIDE_LOAD4]] to <64 x i32>
-; AVX512F-NEXT:    [[TMP22:%.*]] = sub nsw <64 x i32> [[TMP10]], [[TMP20]]
-; AVX512F-NEXT:    [[TMP23:%.*]] = sub nsw <64 x i32> [[TMP11]], [[TMP21]]
-; AVX512F-NEXT:    [[TMP24:%.*]] = icmp sgt <64 x i32> [[TMP22]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; AVX512F-NEXT:    [[TMP25:%.*]] = icmp sgt <64 x i32> [[TMP23]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; AVX512F-NEXT:    [[TMP26:%.*]] = sub nsw <64 x i32> zeroinitializer, [[TMP22]]
-; AVX512F-NEXT:    [[TMP27:%.*]] = sub nsw <64 x i32> zeroinitializer, [[TMP23]]
-; AVX512F-NEXT:    [[TMP28:%.*]] = select <64 x i1> [[TMP24]], <64 x i32> [[TMP22]], <64 x i32> [[TMP26]]
-; AVX512F-NEXT:    [[TMP29:%.*]] = select <64 x i1> [[TMP25]], <64 x i32> [[TMP23]], <64 x i32> [[TMP27]]
-; AVX512F-NEXT:    [[TMP30]] = add <64 x i32> [[TMP28]], [[VEC_PHI]]
-; AVX512F-NEXT:    [[TMP31]] = add <64 x i32> [[TMP29]], [[VEC_PHI1]]
-; AVX512F-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
-; AVX512F-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; AVX512F-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; AVX512F:       middle.block:
-; AVX512F-NEXT:    [[BIN_RDX:%.*]] = add <64 x i32> [[TMP31]], [[TMP30]]
-; AVX512F-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[BIN_RDX]])
-; AVX512F-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; AVX512F-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; AVX512F:       vec.epilog.iter.check:
-; AVX512F-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; AVX512F:       vec.epilog.ph:
-; AVX512F-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP33]], [[VEC_EPILOG_ITER_CHECK]] ]
-; AVX512F-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; AVX512F-NEXT:    [[TMP34:%.*]] = insertelement <32 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; AVX512F-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; AVX512F:       vec.epilog.vector.body:
-; AVX512F-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[VEC_PHI8:%.*]] = phi <32 x i32> [ [[TMP34]], [[VEC_EPILOG_PH]] ], [ [[TMP50:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 0
-; AVX512F-NEXT:    [[TMP36:%.*]] = add nsw i64 [[TMP35]], 3
-; AVX512F-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP36]]
-; AVX512F-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, i8* [[TMP37]], i32 0
-; AVX512F-NEXT:    [[TMP39:%.*]] = bitcast i8* [[TMP38]] to <32 x i8>*
-; AVX512F-NEXT:    [[WIDE_LOAD9:%.*]] = load <32 x i8>, <32 x i8>* [[TMP39]], align 1
-; AVX512F-NEXT:    [[TMP40:%.*]] = zext <32 x i8> [[WIDE_LOAD9]] to <32 x i32>
-; AVX512F-NEXT:    [[TMP41:%.*]] = add nsw i64 [[TMP35]], 2
-; AVX512F-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP41]]
-; AVX512F-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, i8* [[TMP42]], i32 0
-; AVX512F-NEXT:    [[TMP44:%.*]] = bitcast i8* [[TMP43]] to <32 x i8>*
-; AVX512F-NEXT:    [[WIDE_LOAD10:%.*]] = load <32 x i8>, <32 x i8>* [[TMP44]], align 1
-; AVX512F-NEXT:    [[TMP45:%.*]] = zext <32 x i8> [[WIDE_LOAD10]] to <32 x i32>
-; AVX512F-NEXT:    [[TMP46:%.*]] = sub nsw <32 x i32> [[TMP40]], [[TMP45]]
-; AVX512F-NEXT:    [[TMP47:%.*]] = icmp sgt <32 x i32> [[TMP46]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; AVX512F-NEXT:    [[TMP48:%.*]] = sub nsw <32 x i32> zeroinitializer, [[TMP46]]
-; AVX512F-NEXT:    [[TMP49:%.*]] = select <32 x i1> [[TMP47]], <32 x i32> [[TMP46]], <32 x i32> [[TMP48]]
-; AVX512F-NEXT:    [[TMP50]] = add <32 x i32> [[TMP49]], [[VEC_PHI8]]
-; AVX512F-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 32
-; AVX512F-NEXT:    [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 1024
-; AVX512F-NEXT:    br i1 [[TMP51]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; AVX512F:       vec.epilog.middle.block:
-; AVX512F-NEXT:    [[TMP52:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP50]])
-; AVX512F-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 1024, 1024
-; AVX512F-NEXT:    br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; AVX512F:       vec.epilog.scalar.ph:
-; AVX512F-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; AVX512F-NEXT:    [[BC_MERGE_RDX12:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP33]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP52]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AVX512F-NEXT:    br label [[FOR_BODY:%.*]]
-; AVX512F:       for.cond.cleanup.loopexit:
-; AVX512F-NEXT:    [[ADD_LCSSA5:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP52]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AVX512F-NEXT:    br label [[FOR_COND_CLEANUP]]
-; AVX512F:       for.cond.cleanup:
-; AVX512F-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ [[ADD_LCSSA5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; AVX512F-NEXT:    ret i32 [[ADD_LCSSA]]
-; AVX512F:       for.body:
-; AVX512F-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX512F-NEXT:    [[S_015:%.*]] = phi i32 [ [[BC_MERGE_RDX12]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; AVX512F-NEXT:    [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], 3
-; AVX512F-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP1]]
-; AVX512F-NEXT:    [[TMP:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX512F-NEXT:    [[CONV:%.*]] = zext i8 [[TMP]] to i32
-; AVX512F-NEXT:    [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; AVX512F-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP2]]
-; AVX512F-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; AVX512F-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP3]] to i32
-; AVX512F-NEXT:    [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV3]]
-; AVX512F-NEXT:    [[ISPOS:%.*]] = icmp sgt i32 [[SUB]], -1
-; AVX512F-NEXT:    [[NEG:%.*]] = sub nsw i32 0, [[SUB]]
-; AVX512F-NEXT:    [[TMP4:%.*]] = select i1 [[ISPOS]], i32 [[SUB]], i32 [[NEG]]
-; AVX512F-NEXT:    [[ADD]] = add nsw i32 [[TMP4]], [[S_015]]
-; AVX512F-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; AVX512F-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; AVX512F-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
+; CHECK-LABEL: goo
+; CHECK:      LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 7 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
+; CHECK:      LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 13 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
 entry:
   br label %for.body
 
@@ -496,144 +91,12 @@ for.body:                                         ; preds = %for.body, %entry
 }
 
 define i64 @bar(i64* nocapture %a) {
-; CHECK-LABEL: @bar(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <2 x i64> [[WIDE_LOAD3]], [[STEP_ADD]]
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64>* [[TMP10]], align 8
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP12]] = add <2 x i64> [[TMP8]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP13]] = add <2 x i64> [[TMP9]], [[VEC_PHI2]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <2 x i64> [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[ADD2_LCSSA:%.*]] = phi i64 [ [[ADD2:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i64 [[ADD2_LCSSA]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_012:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[S_011:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_012]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP16]], [[I_012]]
-; CHECK-NEXT:    store i64 [[ADD]], i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[ADD2]] = add nsw i64 [[ADD]], [[S_011]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_012]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
-; AVX512F-LABEL: @bar(
-; AVX512F-NEXT:  entry:
-; AVX512F-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; AVX512F:       vector.ph:
-; AVX512F-NEXT:    br label [[VECTOR_BODY:%.*]]
-; AVX512F:       vector.body:
-; AVX512F-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[VEC_PHI4:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[VEC_PHI5:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[VEC_PHI6:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[STEP_ADD:%.*]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
-; AVX512F-NEXT:    [[STEP_ADD1:%.*]] = add <8 x i64> [[STEP_ADD]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
-; AVX512F-NEXT:    [[STEP_ADD2:%.*]] = add <8 x i64> [[STEP_ADD1]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
-; AVX512F-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; AVX512F-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; AVX512F-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; AVX512F-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP0]]
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP1]]
-; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP2]]
-; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP3]]
-; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 0
-; AVX512F-NEXT:    [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <8 x i64>*
-; AVX512F-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, <8 x i64>* [[TMP9]], align 8
-; AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 8
-; AVX512F-NEXT:    [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <8 x i64>*
-; AVX512F-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i64>, <8 x i64>* [[TMP11]], align 8
-; AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 16
-; AVX512F-NEXT:    [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <8 x i64>*
-; AVX512F-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x i64>, <8 x i64>* [[TMP13]], align 8
-; AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 24
-; AVX512F-NEXT:    [[TMP15:%.*]] = bitcast i64* [[TMP14]] to <8 x i64>*
-; AVX512F-NEXT:    [[WIDE_LOAD9:%.*]] = load <8 x i64>, <8 x i64>* [[TMP15]], align 8
-; AVX512F-NEXT:    [[TMP16:%.*]] = add nsw <8 x i64> [[WIDE_LOAD]], [[VEC_IND]]
-; AVX512F-NEXT:    [[TMP17:%.*]] = add nsw <8 x i64> [[WIDE_LOAD7]], [[STEP_ADD]]
-; AVX512F-NEXT:    [[TMP18:%.*]] = add nsw <8 x i64> [[WIDE_LOAD8]], [[STEP_ADD1]]
-; AVX512F-NEXT:    [[TMP19:%.*]] = add nsw <8 x i64> [[WIDE_LOAD9]], [[STEP_ADD2]]
-; AVX512F-NEXT:    [[TMP20:%.*]] = bitcast i64* [[TMP8]] to <8 x i64>*
-; AVX512F-NEXT:    store <8 x i64> [[TMP16]], <8 x i64>* [[TMP20]], align 8
-; AVX512F-NEXT:    [[TMP21:%.*]] = bitcast i64* [[TMP10]] to <8 x i64>*
-; AVX512F-NEXT:    store <8 x i64> [[TMP17]], <8 x i64>* [[TMP21]], align 8
-; AVX512F-NEXT:    [[TMP22:%.*]] = bitcast i64* [[TMP12]] to <8 x i64>*
-; AVX512F-NEXT:    store <8 x i64> [[TMP18]], <8 x i64>* [[TMP22]], align 8
-; AVX512F-NEXT:    [[TMP23:%.*]] = bitcast i64* [[TMP14]] to <8 x i64>*
-; AVX512F-NEXT:    store <8 x i64> [[TMP19]], <8 x i64>* [[TMP23]], align 8
-; AVX512F-NEXT:    [[TMP24]] = add <8 x i64> [[TMP16]], [[VEC_PHI]]
-; AVX512F-NEXT:    [[TMP25]] = add <8 x i64> [[TMP17]], [[VEC_PHI4]]
-; AVX512F-NEXT:    [[TMP26]] = add <8 x i64> [[TMP18]], [[VEC_PHI5]]
-; AVX512F-NEXT:    [[TMP27]] = add <8 x i64> [[TMP19]], [[VEC_PHI6]]
-; AVX512F-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; AVX512F-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD2]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
-; AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; AVX512F-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; AVX512F:       middle.block:
-; AVX512F-NEXT:    [[BIN_RDX:%.*]] = add <8 x i64> [[TMP25]], [[TMP24]]
-; AVX512F-NEXT:    [[BIN_RDX10:%.*]] = add <8 x i64> [[TMP26]], [[BIN_RDX]]
-; AVX512F-NEXT:    [[BIN_RDX11:%.*]] = add <8 x i64> [[TMP27]], [[BIN_RDX10]]
-; AVX512F-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX11]])
-; AVX512F-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; AVX512F-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; AVX512F:       scalar.ph:
-; AVX512F-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; AVX512F-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
-; AVX512F-NEXT:    br label [[FOR_BODY:%.*]]
-; AVX512F:       for.cond.cleanup:
-; AVX512F-NEXT:    [[ADD2_LCSSA:%.*]] = phi i64 [ [[ADD2:%.*]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
-; AVX512F-NEXT:    ret i64 [[ADD2_LCSSA]]
-; AVX512F:       for.body:
-; AVX512F-NEXT:    [[I_012:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; AVX512F-NEXT:    [[S_011:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2]], [[FOR_BODY]] ]
-; AVX512F-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_012]]
-; AVX512F-NEXT:    [[TMP30:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; AVX512F-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP30]], [[I_012]]
-; AVX512F-NEXT:    store i64 [[ADD]], i64* [[ARRAYIDX]], align 8
-; AVX512F-NEXT:    [[ADD2]] = add nsw i64 [[ADD]], [[S_011]]
-; AVX512F-NEXT:    [[INC]] = add nuw nsw i64 [[I_012]], 1
-; AVX512F-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], 1024
-; AVX512F-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
+; CHECK-LABEL: bar
+; CHECK:       LV(REG): VF = 2
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 3 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
 
 entry:
   br label %for.body
@@ -663,92 +126,8 @@ define void @hoo(i32 %n) {
 ; For c[i] = e[d[i]] in the loop, e[d[i]] is not consecutive but its index %tmp can
 ; be gathered into a vector. For VF == 16, the vector version of %tmp will be <16 x i64>
 ; so the max usage of AVX512 vector register will be 2.
-; CHECK-LABEL: @hoo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
-; AVX512F-LABEL: @hoo(
-; AVX512F-NEXT:  iter.check:
-; AVX512F-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; AVX512F:       vector.main.loop.iter.check:
-; AVX512F-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; AVX512F:       vector.ph:
-; AVX512F-NEXT:    br label [[VECTOR_BODY:%.*]]
-; AVX512F:       vector.body:
-; AVX512F-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; AVX512F-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[TMP0]]
-; AVX512F-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
-; AVX512F-NEXT:    [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <16 x i64>*
-; AVX512F-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i64>, <16 x i64>* [[TMP3]], align 8
-; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, <16 x i64> [[WIDE_LOAD]]
-; AVX512F-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP4]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[TMP0]]
-; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; AVX512F-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
-; AVX512F-NEXT:    store <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32>* [[TMP7]], align 4
-; AVX512F-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; AVX512F-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX512F-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; AVX512F:       middle.block:
-; AVX512F-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
-; AVX512F-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; AVX512F:       vec.epilog.iter.check:
-; AVX512F-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; AVX512F:       vec.epilog.ph:
-; AVX512F-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; AVX512F-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; AVX512F:       vec.epilog.vector.body:
-; AVX512F-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512F-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
-; AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[TMP9]]
-; AVX512F-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
-; AVX512F-NEXT:    [[TMP12:%.*]] = bitcast i64* [[TMP11]] to <8 x i64>*
-; AVX512F-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i64>, <8 x i64>* [[TMP12]], align 8
-; AVX512F-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, <8 x i64> [[WIDE_LOAD3]]
-; AVX512F-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP13]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
-; AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[TMP9]]
-; AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; AVX512F-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>*
-; AVX512F-NEXT:    store <8 x i32> [[WIDE_MASKED_GATHER4]], <8 x i32>* [[TMP16]], align 4
-; AVX512F-NEXT:    [[INDEX_NEXT5]] = add nuw i64 [[OFFSET_IDX]], 8
-; AVX512F-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT5]], 10000
-; AVX512F-NEXT:    br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; AVX512F:       vec.epilog.middle.block:
-; AVX512F-NEXT:    [[CMP_N1:%.*]] = icmp eq i64 10000, 10000
-; AVX512F-NEXT:    br i1 [[CMP_N1]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; AVX512F:       vec.epilog.scalar.ph:
-; AVX512F-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; AVX512F-NEXT:    br label [[FOR_BODY:%.*]]
-; AVX512F:       for.body:
-; AVX512F-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX512F-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDVARS_IV]]
-; AVX512F-NEXT:    [[TMP:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; AVX512F-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP]]
-; AVX512F-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; AVX512F-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDVARS_IV]]
-; AVX512F-NEXT:    store i32 [[TMP1]], i32* [[ARRAYIDX3]], align 4
-; AVX512F-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; AVX512F-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512F-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; AVX512F:       for.end.loopexit:
-; AVX512F-NEXT:    br label [[FOR_END]]
-; AVX512F:       for.end:
-; AVX512F-NEXT:    ret void
-;
+; AVX512F-LABEL: bar
+; AVX512F:       LV(REG): VF = 16
 ; AVX512F-CHECK: LV(REG): Found max usage: 2 item
 ; AVX512F-CHECK: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
 ; AVX512F-CHECK: LV(REG): RegisterClass: Generic::VectorRC, 2 registers

diff --git a/llvm/test/Transforms/LoopVectorize/X86/register-assumption.ll b/llvm/test/Transforms/LoopVectorize/X86/register-assumption.ll
index 9572c1f6c261c..1add87db611e4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/register-assumption.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/register-assumption.ll
@@ -1,29 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s  -loop-vectorize -instcombine -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 define void @test1() {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca float, align 4
-; CHECK-NEXT:    br label [[LOOP_EXIT_DIM_11_CRITEDGE:%.*]]
-; CHECK:       loop_exit.dim.11.critedge:
-; CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint float* [[ALLOCA]] to i64
-; CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 4
-; CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-; CHECK-NEXT:    br label [[LOOP_HEADER_DIM_017_PREHEADER:%.*]]
-; CHECK:       loop_header.dim.017.preheader.loopexit:
-; CHECK-NEXT:    br label [[LOOP_HEADER_DIM_017_PREHEADER]]
-; CHECK:       loop_header.dim.017.preheader:
-; CHECK-NEXT:    br label [[LOOP_BODY_DIM_018:%.*]]
-; CHECK:       loop_body.dim.018:
-; CHECK-NEXT:    [[INVAR_ADDRESS_DIM_019_0135:%.*]] = phi i64 [ 0, [[LOOP_HEADER_DIM_017_PREHEADER]] ], [ [[TMP0:%.*]], [[LOOP_BODY_DIM_018]] ]
-; CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT:    [[TMP0]] = add nuw nsw i64 [[INVAR_ADDRESS_DIM_019_0135]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[TMP0]], 256
-; CHECK-NEXT:    br i1 [[TMP1]], label [[LOOP_HEADER_DIM_017_PREHEADER_LOOPEXIT:%.*]], label [[LOOP_BODY_DIM_018]]
-;
 entry:
   %alloca = alloca float, align 4
   br label %loop_exit.dim.11.critedge
@@ -40,6 +19,8 @@ loop_header.dim.017.preheader:                    ; preds = %loop_exit.dim.016,
 loop_body.dim.018:                                ; preds = %loop_body.dim.018, %loop_header.dim.017.preheader
   %invar_address.dim.019.0135 = phi i64 [ 0, %loop_header.dim.017.preheader ], [ %0, %loop_body.dim.018 ]
   call void @llvm.assume(i1 %maskcond)
+; CHECK:     call void @llvm.assume(
+; CHECK-NOT: call void @llvm.assume(
   %0 = add nuw nsw i64 %invar_address.dim.019.0135, 1
   %1 = icmp eq i64 %0, 256
   br i1 %1, label %loop_header.dim.017.preheader, label %loop_body.dim.018

diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index 476f0513eaefc..97a1365e4f084 100755
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -15,151 +15,23 @@ target triple = "x86_64-apple-macosx10.11.0"
 ; Function Attrs: norecurse nounwind ssp uwtable
 define void @_Z3fn1v() #0 {
 ; CHECK-LABEL: @_Z3fn1v(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @c, align 4
-; CHECK-NEXT:    [[CMP34:%.*]] = icmp sgt i32 [[TMP0]], 8
-; CHECK-NEXT:    br i1 [[CMP34]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.lr.ph:
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @a, align 4
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* @b, align 8
-; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[TMP2]], 4063299859190
-; CHECK-NEXT:    [[TOBOOL6:%.*]] = icmp eq i64 [[MUL]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP0]] to i64
-; CHECK-NEXT:    br i1 [[TOBOOL]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i64 [[TMP3]], -9
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP4]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add nuw i64 [[TMP5]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP6]], 16
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 8, [[TMP7]]
-; CHECK-NEXT:    [[IND_END2:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND3:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <16 x i64> [[TMP8]], [[VEC_IND3]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP9]], <16 x i64> [[TMP10]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP11]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <16 x i64> [[TMP8]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP9]], <16 x i64> [[TMP13]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP14]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND3:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, %vector.ph ], [ [[VEC_IND_NEXT4:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
 ; CHECK-NEXT:    [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT99:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 8, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body.us.preheader:
-; CHECK-NEXT:    [[TMP16:%.*]] = add nsw i64 [[TMP3]], -9
-; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP16]], 1
-; CHECK-NEXT:    [[TMP18:%.*]] = add nuw i64 [[TMP17]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK8:%.*]] = icmp ult i64 [[TMP18]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK8]], label [[SCALAR_PH6:%.*]], label [[VECTOR_PH9:%.*]]
-; CHECK:       vector.ph9:
-; CHECK-NEXT:    [[N_MOD_VF10:%.*]] = urem i64 [[TMP18]], 16
-; CHECK-NEXT:    [[N_VEC11:%.*]] = sub i64 [[TMP18]], [[N_MOD_VF10]]
-; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[N_VEC11]], 2
-; CHECK-NEXT:    [[IND_END13:%.*]] = add i64 8, [[TMP19]]
-; CHECK-NEXT:    [[IND_END15:%.*]] = mul i64 [[N_VEC11]], 2
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[TOBOOL6]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY7:%.*]]
-; CHECK:       vector.body7:
-; CHECK-NEXT:    [[INDEX17:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT22:%.*]], [[VECTOR_BODY7]] ]
-; CHECK-NEXT:    [[VEC_IND18:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH9]] ], [ [[VEC_IND_NEXT19:%.*]], [[VECTOR_BODY7]] ]
-; CHECK-NEXT:    [[VEC_IND20:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH9]] ], [ [[VEC_IND_NEXT21:%.*]], [[VECTOR_BODY7]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND18]]
-; CHECK-NEXT:    [[TMP22:%.*]] = add nsw <16 x i64> [[TMP20]], [[VEC_IND20]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP21]], <16 x i64> [[TMP22]], i64 0
-; CHECK-NEXT:    [[TMP24:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP23]], i32 16, <16 x i1> [[TMP24]])
-; CHECK-NEXT:    [[TMP25:%.*]] = or <16 x i64> [[VEC_IND20]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP26:%.*]] = add nsw <16 x i64> [[TMP20]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP21]], <16 x i64> [[TMP26]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP27]], i32 8, <16 x i1> [[TMP24]])
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>, <16 x i32*> [[TMP23]], i32 16, <16 x i1> [[BROADCAST_SPLAT]])
-; CHECK-NEXT:    [[TMP28:%.*]] = or <16 x i64> [[VEC_IND20]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP29:%.*]] = add nsw <16 x i64> [[TMP20]], [[TMP28]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP21]], <16 x i64> [[TMP29]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>, <16 x i32*> [[TMP30]], i32 8, <16 x i1> [[BROADCAST_SPLAT]])
-; CHECK-NEXT:    [[INDEX_NEXT22]] = add nuw i64 [[INDEX17]], 16
-; CHECK-NEXT:    [[VEC_IND_NEXT19]] = add <16 x i64> [[VEC_IND18]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-; CHECK-NEXT:    [[VEC_IND_NEXT21]] = add <16 x i64> [[VEC_IND20]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC11]]
-; CHECK-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY7]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       middle.block5:
-; CHECK-NEXT:    [[CMP_N16:%.*]] = icmp eq i64 [[TMP18]], [[N_VEC11]]
-; CHECK-NEXT:    br i1 [[CMP_N16]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH6]]
-; CHECK:       scalar.ph6:
-; CHECK-NEXT:    [[BC_RESUME_VAL12:%.*]] = phi i64 [ [[IND_END13]], [[MIDDLE_BLOCK5]] ], [ 8, [[FOR_BODY_US_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL14:%.*]] = phi i64 [ [[IND_END15]], [[MIDDLE_BLOCK5]] ], [ 0, [[FOR_BODY_US_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
-; CHECK:       for.body.us:
-; CHECK-NEXT:    [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US:%.*]] ], [ [[BC_RESUME_VAL12]], [[SCALAR_PH6]] ]
-; CHECK-NEXT:    [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL14]], [[SCALAR_PH6]] ]
-; CHECK-NEXT:    [[TMP32:%.*]] = sub nsw i64 8, [[INDVARS_IV78]]
-; CHECK-NEXT:    [[ADD_PTR_US:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[INDVARS_IV78]]
-; CHECK-NEXT:    [[TMP33:%.*]] = add nsw i64 [[TMP32]], [[INDVARS_IV70]]
-; CHECK-NEXT:    [[ARRAYDECAY_US_US_US:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ADD_PTR_US]], i64 [[TMP33]], i64 0
-; CHECK-NEXT:    br i1 [[TOBOOL6]], label [[FOR_BODY5_US_US_US_PREHEADER:%.*]], label [[FOR_BODY5_US_US48_PREHEADER:%.*]]
-; CHECK:       for.body5.us.us48.preheader:
-; CHECK-NEXT:    store i32 8, i32* [[ARRAYDECAY_US_US_US]], align 16
-; CHECK-NEXT:    [[INDVARS_IV_NEXT66:%.*]] = or i64 [[INDVARS_IV70]], 1
-; CHECK-NEXT:    [[TMP34:%.*]] = add nsw i64 [[TMP32]], [[INDVARS_IV_NEXT66]]
-; CHECK-NEXT:    [[ARRAYDECAY_US_US55_1:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ADD_PTR_US]], i64 [[TMP34]], i64 0
-; CHECK-NEXT:    store i32 8, i32* [[ARRAYDECAY_US_US55_1]], align 8
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP4_US_LCSSA_US_US]]
-; CHECK:       for.body5.us.us.us.preheader:
-; CHECK-NEXT:    store i32 7, i32* [[ARRAYDECAY_US_US_US]], align 16
-; CHECK-NEXT:    [[INDVARS_IV_NEXT73:%.*]] = or i64 [[INDVARS_IV70]], 1
-; CHECK-NEXT:    [[TMP35:%.*]] = add nsw i64 [[TMP32]], [[INDVARS_IV_NEXT73]]
-; CHECK-NEXT:    [[ARRAYDECAY_US_US_US_1:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ADD_PTR_US]], i64 [[TMP35]], i64 0
-; CHECK-NEXT:    store i32 7, i32* [[ARRAYDECAY_US_US_US_1]], align 8
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP4_US_LCSSA_US_US]]
-; CHECK:       for.cond.cleanup4.us-lcssa.us.us:
-; CHECK-NEXT:    [[INDVARS_IV_NEXT79]] = add nuw nsw i64 [[INDVARS_IV78]], 2
-; CHECK-NEXT:    [[CMP_US:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT79]], [[TMP3]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT71]] = add nuw nsw i64 [[INDVARS_IV70]], 2
-; CHECK-NEXT:    br i1 [[CMP_US]], label [[FOR_BODY_US]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup.loopexit99:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV95:%.*]] = phi i64 [ [[INDVARS_IV_NEXT96:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[INDVARS_IV87:%.*]] = phi i64 [ [[INDVARS_IV_NEXT88:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP36:%.*]] = sub nsw i64 8, [[INDVARS_IV95]]
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[INDVARS_IV95]]
-; CHECK-NEXT:    [[TMP37:%.*]] = add nsw i64 [[TMP36]], [[INDVARS_IV87]]
-; CHECK-NEXT:    [[ARRAYDECAY_US31:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ADD_PTR]], i64 [[TMP37]], i64 0
-; CHECK-NEXT:    store i32 8, i32* [[ARRAYDECAY_US31]], align 16
-; CHECK-NEXT:    [[INDVARS_IV_NEXT90:%.*]] = or i64 [[INDVARS_IV87]], 1
-; CHECK-NEXT:    [[TMP38:%.*]] = add nsw i64 [[TMP36]], [[INDVARS_IV_NEXT90]]
-; CHECK-NEXT:    [[ARRAYDECAY_US31_1:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ADD_PTR]], i64 [[TMP38]], i64 0
-; CHECK-NEXT:    store i32 8, i32* [[ARRAYDECAY_US31_1]], align 8
-; CHECK-NEXT:    [[INDVARS_IV_NEXT96]] = add nuw nsw i64 [[INDVARS_IV95]], 2
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT96]], [[TMP3]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT88]] = add nuw nsw i64 [[INDVARS_IV87]], 2
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT99]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
 ;
 entry:
   %0 = load i32, i32* @c, align 4

diff --git a/llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll b/llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
index 4ca3f9702da5c..cd3e89ae73504 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -mcpu=slm -debug 2>&1 | FileCheck -check-prefix=MSG %s
 ; REQUIRES: asserts
 ; This test should not be vectorized in X86\SLM arch
@@ -10,41 +9,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 define i32 @no_vec(i32 %LastIndex, i16* nocapture readonly %InputData, i16 signext %lag, i16 signext %Scale) {
-; MSG-LABEL: @no_vec(
-; MSG-NEXT:  entry:
-; MSG-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[LASTINDEX:%.*]], 0
-; MSG-NEXT:    br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; MSG:       for.body.lr.ph:
-; MSG-NEXT:    [[CONV5:%.*]] = sext i16 [[SCALE:%.*]] to i64
-; MSG-NEXT:    [[SH_PROM:%.*]] = and i64 [[CONV5]], 4294967295
-; MSG-NEXT:    [[TMP0:%.*]] = sext i16 [[LAG:%.*]] to i64
-; MSG-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LASTINDEX]] to i64
-; MSG-NEXT:    br label [[FOR_BODY:%.*]]
-; MSG:       for.cond.cleanup.loopexit:
-; MSG-NEXT:    [[ADD7_LCSSA:%.*]] = phi i64 [ [[ADD7:%.*]], [[FOR_BODY]] ]
-; MSG-NEXT:    [[CONV8:%.*]] = trunc i64 [[ADD7_LCSSA]] to i32
-; MSG-NEXT:    br label [[FOR_COND_CLEANUP]]
-; MSG:       for.cond.cleanup:
-; MSG-NEXT:    [[ACCUMULATOR_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[CONV8]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; MSG-NEXT:    ret i32 [[ACCUMULATOR_0_LCSSA]]
-; MSG:       for.body:
-; MSG-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; MSG-NEXT:    [[ACCUMULATOR_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ]
-; MSG-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[INPUTDATA:%.*]], i64 [[INDVARS_IV]]
-; MSG-NEXT:    [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; MSG-NEXT:    [[CONV:%.*]] = sext i16 [[TMP1]] to i64
-; MSG-NEXT:    [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]]
-; MSG-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[INPUTDATA]], i64 [[TMP2]]
-; MSG-NEXT:    [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2
-; MSG-NEXT:    [[CONV4:%.*]] = sext i16 [[TMP3]] to i64
-; MSG-NEXT:    [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV]]
-; MSG-NEXT:    [[SHR:%.*]] = ashr i64 [[MUL]], [[SH_PROM]]
-; MSG-NEXT:    [[ADD7]] = add i64 [[SHR]], [[ACCUMULATOR_018]]
-; MSG-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; MSG-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; MSG-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
 entry:
+; MSG: LV: Selecting VF: 1. 
   %cmp17 = icmp sgt i32 %LastIndex, 0
   br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
 
@@ -71,11 +37,11 @@ for.body:                                         ; preds = %for.body, %for.body
   %conv = sext i16 %1 to i64
   %2 = add nsw i64 %indvars.iv, %0
   %arrayidx3 = getelementptr inbounds i16, i16* %InputData, i64 %2
-  %3 = load i16, i16* %arrayidx3, align 2
+  %3 = load i16, i16* %arrayidx3, align 2 
   %conv4 = sext i16 %3 to i64
   %mul = mul nsw i64 %conv4, %conv
   %shr = ashr i64 %mul, %sh_prom
-  %add7 = add i64 %shr, %Accumulator.018
+  %add7 = add i64 %shr, %Accumulator.018 
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body

diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
index 08723697421aa..42c280df6ad02 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -vector-library=SVML -inject-tli-mappings -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -49,42 +48,8 @@ declare float @llvm.exp2.f32(float) #0
 
 define void @sin_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @sin_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @sin(double [[CONV]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -106,42 +71,8 @@ for.end:
 
 define void @sin_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @sin_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @sinf(float [[CONV]]) #[[ATTR4:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -163,42 +94,8 @@ for.end:
 
 define void @sin_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @sin_f64_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -220,42 +117,8 @@ for.end:
 
 define void @sin_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @sin_f32_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[CONV]]) #[[ATTR6:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -277,42 +140,8 @@ for.end:
 
 define void @cos_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @cos_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @cos(double [[CONV]]) #[[ATTR7:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -334,42 +163,8 @@ for.end:
 
 define void @cos_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @cos_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @cosf(float [[CONV]]) #[[ATTR8:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -391,42 +186,8 @@ for.end:
 
 define void @cos_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @cos_f64_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV]]) #[[ATTR9:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -448,42 +209,8 @@ for.end:
 
 define void @cos_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @cos_f32_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[CONV]]) #[[ATTR10:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -505,59 +232,8 @@ for.end:
 
 define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VARRAY1:%.*]] = bitcast double* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT:    [[EXP3:%.*]] = bitcast double* [[EXP:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[VARRAY]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr double, double* [[EXP]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[TMP3]] to <4 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 4, !alias.scope !18
-; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP1]], <4 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP5]], <4 x double>* [[TMP8]], align 4, !alias.scope !21, !noalias !18
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[EXP]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call double @pow(double [[CONV]], double [[TMP1]]) #[[ATTR11:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[TMP2]], double* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -581,59 +257,8 @@ for.end:
 
 define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f64_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VARRAY1:%.*]] = bitcast double* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT:    [[EXP3:%.*]] = bitcast double* [[EXP:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[VARRAY]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr double, double* [[EXP]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[TMP3]] to <4 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 4, !alias.scope !25
-; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP1]], <4 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP5]], <4 x double>* [[TMP8]], align 4, !alias.scope !28, !noalias !25
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[EXP]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call double @llvm.pow.f64(double [[CONV]], double [[TMP1]]) #[[ATTR12:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[TMP2]], double* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -657,59 +282,8 @@ for.end:
 
 define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT:    [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4, !alias.scope !32
-; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP1]], <4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP8]], align 4, !alias.scope !35, !noalias !32
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call float @powf(float [[CONV]], float [[TMP1]]) #[[ATTR13:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -733,59 +307,8 @@ for.end:
 
 define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f32_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT:    [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4, !alias.scope !39
-; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP1]], <4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP8]], align 4, !alias.scope !42, !noalias !39
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call float @llvm.pow.f32(float [[CONV]], float [[TMP1]]) #[[ATTR14:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -809,42 +332,8 @@ for.end:
 
 define void @exp_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @exp_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @exp(double [[CONV]]) #[[ATTR15:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -866,42 +355,8 @@ for.end:
 
 define void @exp_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @exp_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @expf(float [[CONV]]) #[[ATTR16:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -923,42 +378,8 @@ for.end:
 
 define void @exp_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @exp_f64_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @llvm.exp.f64(double [[CONV]]) #[[ATTR17:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -980,42 +401,8 @@ for.end:
 
 define void @exp_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @exp_f32_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @llvm.exp.f32(float [[CONV]]) #[[ATTR18:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP53:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1037,42 +424,8 @@ for.end:
 
 define void @log_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @log_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @log(double [[CONV]]) #[[ATTR19:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP55:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1094,42 +447,8 @@ for.end:
 
 define void @log_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @log_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP56:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @logf(float [[CONV]]) #[[ATTR20:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP57:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1151,42 +470,8 @@ for.end:
 
 define void @log_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @log_f64_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @llvm.log.f64(double [[CONV]]) #[[ATTR21:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP59:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1208,42 +493,8 @@ for.end:
 
 define void @log_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @log_f32_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP60:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @llvm.log.f32(float [[CONV]]) #[[ATTR22:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP61:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1265,42 +516,8 @@ for.end:
 
 define void @log2_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @log2_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP62:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @log2(double [[CONV]]) #[[ATTR23:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP63:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1322,42 +539,8 @@ for.end:
 
 define void @log2_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @log2_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP64:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @log2f(float [[CONV]]) #[[ATTR24:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP65:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1379,42 +562,8 @@ for.end:
 
 define void @log2_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @log2_f64_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP66:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @llvm.log2.f64(double [[CONV]]) #[[ATTR25:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP67:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1436,42 +585,8 @@ for.end:
 
 define void @log2_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @log2_f32_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP68:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @llvm.log2.f32(float [[CONV]]) #[[ATTR26:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP69:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1493,42 +608,8 @@ for.end:
 
 define void @log10_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @log10_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP70:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @log10(double [[CONV]]) #[[ATTR27:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP71:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1550,42 +631,8 @@ for.end:
 
 define void @log10_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @log10_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP72:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @log10f(float [[CONV]]) #[[ATTR28:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP73:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1607,42 +654,8 @@ for.end:
 
 define void @log10_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @log10_f64_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP74:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @llvm.log10.f64(double [[CONV]]) #[[ATTR29:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP75:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1664,42 +677,8 @@ for.end:
 
 define void @log10_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @log10_f32_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP76:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @llvm.log10.f32(float [[CONV]]) #[[ATTR30:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP77:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1721,42 +700,8 @@ for.end:
 
 define void @sqrt_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @sqrt_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_sqrt4(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP78:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @sqrt(double [[CONV]]) #[[ATTR31:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP79:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_sqrt4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1778,42 +723,8 @@ for.end:
 
 define void @sqrt_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @sqrt_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_sqrtf4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP80:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @sqrtf(float [[CONV]]) #[[ATTR32:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP81:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_sqrtf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1835,42 +746,8 @@ for.end:
 
 define void @exp2_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @exp2_f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP82:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @exp2(double [[CONV]]) #[[ATTR33:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP83:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1892,42 +769,8 @@ for.end:
 
 define void @exp2_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @exp2_f32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP84:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @exp2f(float [[CONV]]) #[[ATTR34:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP85:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -1949,42 +792,8 @@ for.end:
 
 define void @exp2_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @exp2_f64_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP86:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @llvm.exp2.f64(double [[CONV]]) #[[ATTR35:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP87:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body
@@ -2006,42 +815,8 @@ for.end:
 
 define void @exp2_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @exp2_f32_intrinsic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP88:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @llvm.exp2.f32(float [[CONV]]) #[[ATTR36:[0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT:    store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP89:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
 ;
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
index cf75dc8dfeeab..125eec7cc7590 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
@@ -16,8 +16,8 @@ define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noal
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
@@ -33,7 +33,7 @@ define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noal
 ; CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP1]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -52,7 +52,7 @@ define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noal
 ; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
 ;
 entry:
   br label %for.body
@@ -87,8 +87,8 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
@@ -104,7 +104,7 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa
 ; CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP1]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -123,7 +123,7 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa
 ; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]]
 ;
 entry:
   br label %for.body
@@ -176,8 +176,8 @@ define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule <8 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
@@ -191,7 +191,7 @@ define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP12]], <8 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP13]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -211,7 +211,7 @@ define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK-NEXT:    [[SUM_1]] = add nuw nsw i32 [[ADD]], [[SUM_0]]
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_1_LCSSA]]

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
index a0bba626c3387..6a43fa86057b2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
@@ -10,75 +9,6 @@ target triple = "x86_64-apple-macosx10.8.0"
 ; CHECK: cost of 5 for VF 2 For instruction:   %conv = uitofp i64 %tmp to double
 ; CHECK: cost of 10 for VF 4 For instruction:   %conv = uitofp i64 %tmp to double
 define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind {
-; CHECK-LABEL: @uint64_to_double_cost(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <4 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, <4 x i64>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <4 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 8
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <4 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i64>, <4 x i64>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 12
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i64* [[TMP14]] to <4 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP15]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = uitofp <4 x i64> [[WIDE_LOAD]] to <4 x double>
-; CHECK-NEXT:    [[TMP17:%.*]] = uitofp <4 x i64> [[WIDE_LOAD1]] to <4 x double>
-; CHECK-NEXT:    [[TMP18:%.*]] = uitofp <4 x i64> [[WIDE_LOAD2]] to <4 x double>
-; CHECK-NEXT:    [[TMP19:%.*]] = uitofp <4 x i64> [[WIDE_LOAD3]] to <4 x double>
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP16]], <4 x double>* [[TMP25]], align 4
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 4
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP17]], <4 x double>* [[TMP27]], align 4
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 8
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP18]], <4 x double>* [[TMP29]], align 4
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 12
-; CHECK-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP19]], <4 x double>* [[TMP31]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP:%.*]] = load i64, i64* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CONV:%.*]] = uitofp i64 [[TMP]] to double
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store double [[CONV]], double* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 256
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body
 for.body:

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
index e084950aa510b..5eb5840593238 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
@@ -16,13 +16,13 @@ define i32 @uniform_load(i32* align(4) %addr) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ADDR:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ADDR:%.*]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -36,7 +36,7 @@ define i32 @uniform_load(i32* align(4) %addr) {
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       loopexit:
-; CHECK-NEXT:    [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[LOAD_LCSSA]]
 ;
 entry:
@@ -61,39 +61,39 @@ define i32 @uniform_load2(i32* align(4) %addr) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ADDR:%.*]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ADDR:%.*]], align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP5]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
-; CHECK-NEXT:    [[TMP6]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT:    [[TMP7]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT:    [[TMP8]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP9]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
+; CHECK-NEXT:    [[TMP10]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
+; CHECK-NEXT:    [[TMP11]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP6]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP7]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP11]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -104,7 +104,7 @@ define i32 @uniform_load2(i32* align(4) %addr) {
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       loopexit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
@@ -131,21 +131,21 @@ define i32 @uniform_address(i32* align(4) %addr, i32 %byte_offset) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = udiv i32 [[BYTE_OFFSET:%.*]], 4
-; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = udiv i32 [[BYTE_OFFSET:%.*]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP11]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -161,7 +161,7 @@ define i32 @uniform_address(i32* align(4) %addr, i32 %byte_offset) {
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       loopexit:
-; CHECK-NEXT:    [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[LOAD_LCSSA]]
 ;
 entry:
@@ -207,8 +207,8 @@ define void @uniform_store_uniform_value(i32* align(4) %addr) {
 ; CHECK-NEXT:    store i32 0, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    store i32 0, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -246,46 +246,46 @@ define void @uniform_store_varying_value(i32* align(4) %addr) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[STEP_ADD1:%.*]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[STEP_ADD2:%.*]] = add <4 x i32> [[STEP_ADD1]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 0
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[ADDR:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 1
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 0
-; CHECK-NEXT:    store i32 [[TMP4]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 1
+; CHECK-NEXT:    [[VEC_IND4:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD5:%.*]] = add <4 x i32> [[VEC_IND4]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[STEP_ADD6:%.*]] = add <4 x i32> [[STEP_ADD5]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[STEP_ADD7:%.*]] = add <4 x i32> [[STEP_ADD6]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[VEC_IND4]], i32 0
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[ADDR:%.*]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[VEC_IND4]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP5]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[VEC_IND4]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP6]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[VEC_IND4]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP7]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[STEP_ADD1]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[STEP_ADD5]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP8]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[STEP_ADD1]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[STEP_ADD5]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP9]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[STEP_ADD1]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[STEP_ADD5]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP10]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[STEP_ADD1]], i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[STEP_ADD5]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP11]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[STEP_ADD2]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[STEP_ADD6]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP12]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[STEP_ADD2]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[STEP_ADD6]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP13]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[STEP_ADD2]], i32 2
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[STEP_ADD6]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP14]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[STEP_ADD2]], i32 3
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[STEP_ADD6]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP15]], i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i32 0
+; CHECK-NEXT:    store i32 [[TMP16]], i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i32 1
+; CHECK-NEXT:    store i32 [[TMP17]], i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i32 2
+; CHECK-NEXT:    store i32 [[TMP18]], i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i32 3
+; CHECK-NEXT:    store i32 [[TMP19]], i32* [[ADDR]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[VEC_IND_NEXT9]] = add <4 x i32> [[STEP_ADD7]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -367,29 +367,29 @@ define void @uniform_copy(i32* %A, i32* %B) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP5]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP5]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP5]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP5]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP6]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP6]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP6]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP6]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP7]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP7]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP7]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    store i32 [[TMP7]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -561,39 +561,39 @@ define i32 @uniform_load_global() {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @GAddr, align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @GAddr, align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* @GAddr, align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* @GAddr, align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP5]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
-; CHECK-NEXT:    [[TMP6]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT:    [[TMP7]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT:    [[TMP8]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP9]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
+; CHECK-NEXT:    [[TMP10]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
+; CHECK-NEXT:    [[TMP11]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP6]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP7]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP11]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -604,7 +604,7 @@ define i32 @uniform_load_global() {
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       loopexit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
@@ -632,39 +632,39 @@ define i32 @uniform_load_constexpr() {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP5]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
-; CHECK-NEXT:    [[TMP6]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT:    [[TMP7]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT:    [[TMP8]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP9]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
+; CHECK-NEXT:    [[TMP10]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
+; CHECK-NEXT:    [[TMP11]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP6]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP7]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP11]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -675,7 +675,7 @@ define i32 @uniform_load_constexpr() {
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       loopexit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
index 8d7451c54bfc3..8d5500dbd58a8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
@@ -1,48 +1,9 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; CHECK: 'foo'
 ; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %shift = ashr i32 %val, %k
 define void @foo(i32* nocapture %p, i32 %k) local_unnamed_addr #0 {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[K:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = ashr <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[BODY:%.*]]
-; CHECK:       body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXT:%.*]], [[BODY]] ]
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 [[I]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[PTR]], align 4
-; CHECK-NEXT:    [[SHIFT:%.*]] = ashr i32 [[VAL]], [[K]]
-; CHECK-NEXT:    store i32 [[SHIFT]], i32* [[PTR]], align 4
-; CHECK-NEXT:    [[NEXT]] = add nuw nsw i64 [[I]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[NEXT]], 16
-; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT]], label [[BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index a2b74bdd07307..3e7341f5ad42d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -38,7 +38,7 @@ define void @vectorized(float* noalias nocapture %A, float* noalias nocapture re
 ; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 20, 16
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -55,7 +55,7 @@ define void @vectorized(float* noalias nocapture %A, float* noalias nocapture re
 ; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -97,8 +97,8 @@ define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture r
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
@@ -112,7 +112,7 @@ define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture r
 ; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group !6
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -128,7 +128,7 @@ define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture r
 ; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -180,7 +180,7 @@ define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture r
 ; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -197,7 +197,7 @@ define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture r
 ; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;

diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll b/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
index 59c444b858be5..2d2082eec3aa7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX1
 ; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=core-avx2 -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX2
 ; REQUIRES: asserts
@@ -21,224 +20,6 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK-AVX1: LV: Selecting VF: 16.
 ; CHECK-AVX2: LV: Selecting VF: 32.
 define void @foo() {
-; CHECK-AVX1-LABEL: @foo(
-; CHECK-AVX1-NEXT:  iter.check:
-; CHECK-AVX1-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-AVX1:       vector.main.loop.iter.check:
-; CHECK-AVX1-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-AVX1:       vector.ph:
-; CHECK-AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-AVX1:       vector.body:
-; CHECK-AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-AVX1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-AVX1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 [[TMP0]]
-; CHECK-AVX1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
-; CHECK-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 [[TMP0]]
-; CHECK-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
-; CHECK-AVX1-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 1
-; CHECK-AVX1-NEXT:    [[TMP7:%.*]] = add <16 x i8> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 [[TMP0]]
-; CHECK-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
-; CHECK-AVX1-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[TMP10]], align 1
-; CHECK-AVX1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 [[TMP0]]
-; CHECK-AVX1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
-; CHECK-AVX1-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4
-; CHECK-AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 [[TMP0]]
-; CHECK-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <16 x i32>*
-; CHECK-AVX1-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i32>, <16 x i32>* [[TMP16]], align 4
-; CHECK-AVX1-NEXT:    [[TMP17:%.*]] = add nsw <16 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD2]]
-; CHECK-AVX1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 [[TMP0]]
-; CHECK-AVX1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <16 x i32>*
-; CHECK-AVX1-NEXT:    store <16 x i32> [[TMP17]], <16 x i32>* [[TMP20]], align 4
-; CHECK-AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992
-; CHECK-AVX1-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-AVX1:       middle.block:
-; CHECK-AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 992
-; CHECK-AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-AVX1:       vec.epilog.iter.check:
-; CHECK-AVX1-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-AVX1:       vec.epilog.ph:
-; CHECK-AVX1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-AVX1-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK-AVX1:       vec.epilog.vector.body:
-; CHECK-AVX1-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-AVX1-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 [[TMP22]]
-; CHECK-AVX1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[TMP23]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP24]] to <8 x i8>*
-; CHECK-AVX1-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i8>, <8 x i8>* [[TMP25]], align 1
-; CHECK-AVX1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 [[TMP22]]
-; CHECK-AVX1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, i8* [[TMP26]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP28:%.*]] = bitcast i8* [[TMP27]] to <8 x i8>*
-; CHECK-AVX1-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i8>, <8 x i8>* [[TMP28]], align 1
-; CHECK-AVX1-NEXT:    [[TMP29:%.*]] = add <8 x i8> [[WIDE_LOAD7]], [[WIDE_LOAD6]]
-; CHECK-AVX1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 [[TMP22]]
-; CHECK-AVX1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP30]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP32:%.*]] = bitcast i8* [[TMP31]] to <8 x i8>*
-; CHECK-AVX1-NEXT:    store <8 x i8> [[TMP29]], <8 x i8>* [[TMP32]], align 1
-; CHECK-AVX1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 [[TMP22]]
-; CHECK-AVX1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP33]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <8 x i32>*
-; CHECK-AVX1-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x i32>, <8 x i32>* [[TMP35]], align 4
-; CHECK-AVX1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 [[TMP22]]
-; CHECK-AVX1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP38:%.*]] = bitcast i32* [[TMP37]] to <8 x i32>*
-; CHECK-AVX1-NEXT:    [[WIDE_LOAD9:%.*]] = load <8 x i32>, <8 x i32>* [[TMP38]], align 4
-; CHECK-AVX1-NEXT:    [[TMP39:%.*]] = add nsw <8 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD8]]
-; CHECK-AVX1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 [[TMP22]]
-; CHECK-AVX1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[TMP40]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <8 x i32>*
-; CHECK-AVX1-NEXT:    store <8 x i32> [[TMP39]], <8 x i32>* [[TMP42]], align 4
-; CHECK-AVX1-NEXT:    [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-AVX1-NEXT:    [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT10]], 1000
-; CHECK-AVX1-NEXT:    br i1 [[TMP43]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-AVX1:       vec.epilog.middle.block:
-; CHECK-AVX1-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 1000, 1000
-; CHECK-AVX1-NEXT:    br i1 [[CMP_N4]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK-AVX1:       vec.epilog.scalar.ph:
-; CHECK-AVX1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-AVX1-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-AVX1:       for.cond.cleanup.loopexit:
-; CHECK-AVX1-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK-AVX1:       for.cond.cleanup:
-; CHECK-AVX1-NEXT:    ret void
-; CHECK-AVX1:       for.body:
-; CHECK-AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT:    [[TMP44:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-AVX1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT:    [[TMP45:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; CHECK-AVX1-NEXT:    [[ADD:%.*]] = add i8 [[TMP45]], [[TMP44]]
-; CHECK-AVX1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX6]], align 1
-; CHECK-AVX1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT:    [[TMP46:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
-; CHECK-AVX1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT:    [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
-; CHECK-AVX1-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP47]], [[TMP46]]
-; CHECK-AVX1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT:    store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
-; CHECK-AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-AVX1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-;
-; CHECK-AVX2-LABEL: @foo(
-; CHECK-AVX2-NEXT:  iter.check:
-; CHECK-AVX2-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-AVX2:       vector.main.loop.iter.check:
-; CHECK-AVX2-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-AVX2:       vector.ph:
-; CHECK-AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-AVX2:       vector.body:
-; CHECK-AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-AVX2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-AVX2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 [[TMP0]]
-; CHECK-AVX2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <32 x i8>*
-; CHECK-AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP3]], align 1
-; CHECK-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 [[TMP0]]
-; CHECK-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <32 x i8>*
-; CHECK-AVX2-NEXT:    [[WIDE_LOAD1:%.*]] = load <32 x i8>, <32 x i8>* [[TMP6]], align 1
-; CHECK-AVX2-NEXT:    [[TMP7:%.*]] = add <32 x i8> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 [[TMP0]]
-; CHECK-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <32 x i8>*
-; CHECK-AVX2-NEXT:    store <32 x i8> [[TMP7]], <32 x i8>* [[TMP10]], align 1
-; CHECK-AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 [[TMP0]]
-; CHECK-AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <32 x i32>*
-; CHECK-AVX2-NEXT:    [[WIDE_LOAD2:%.*]] = load <32 x i32>, <32 x i32>* [[TMP13]], align 4
-; CHECK-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 [[TMP0]]
-; CHECK-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <32 x i32>*
-; CHECK-AVX2-NEXT:    [[WIDE_LOAD3:%.*]] = load <32 x i32>, <32 x i32>* [[TMP16]], align 4
-; CHECK-AVX2-NEXT:    [[TMP17:%.*]] = add nsw <32 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD2]]
-; CHECK-AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 [[TMP0]]
-; CHECK-AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <32 x i32>*
-; CHECK-AVX2-NEXT:    store <32 x i32> [[TMP17]], <32 x i32>* [[TMP20]], align 4
-; CHECK-AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992
-; CHECK-AVX2-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-AVX2:       middle.block:
-; CHECK-AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 992
-; CHECK-AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-AVX2:       vec.epilog.iter.check:
-; CHECK-AVX2-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-AVX2:       vec.epilog.ph:
-; CHECK-AVX2-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-AVX2-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK-AVX2:       vec.epilog.vector.body:
-; CHECK-AVX2-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-AVX2-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 [[TMP22]]
-; CHECK-AVX2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[TMP23]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP24]] to <16 x i8>*
-; CHECK-AVX2-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, <16 x i8>* [[TMP25]], align 1
-; CHECK-AVX2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 [[TMP22]]
-; CHECK-AVX2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, i8* [[TMP26]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP28:%.*]] = bitcast i8* [[TMP27]] to <16 x i8>*
-; CHECK-AVX2-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP28]], align 1
-; CHECK-AVX2-NEXT:    [[TMP29:%.*]] = add <16 x i8> [[WIDE_LOAD7]], [[WIDE_LOAD6]]
-; CHECK-AVX2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 [[TMP22]]
-; CHECK-AVX2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP30]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP32:%.*]] = bitcast i8* [[TMP31]] to <16 x i8>*
-; CHECK-AVX2-NEXT:    store <16 x i8> [[TMP29]], <16 x i8>* [[TMP32]], align 1
-; CHECK-AVX2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 [[TMP22]]
-; CHECK-AVX2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP33]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <16 x i32>*
-; CHECK-AVX2-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i32>, <16 x i32>* [[TMP35]], align 4
-; CHECK-AVX2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 [[TMP22]]
-; CHECK-AVX2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP38:%.*]] = bitcast i32* [[TMP37]] to <16 x i32>*
-; CHECK-AVX2-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i32>, <16 x i32>* [[TMP38]], align 4
-; CHECK-AVX2-NEXT:    [[TMP39:%.*]] = add nsw <16 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD8]]
-; CHECK-AVX2-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 [[TMP22]]
-; CHECK-AVX2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[TMP40]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <16 x i32>*
-; CHECK-AVX2-NEXT:    store <16 x i32> [[TMP39]], <16 x i32>* [[TMP42]], align 4
-; CHECK-AVX2-NEXT:    [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 16
-; CHECK-AVX2-NEXT:    [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT10]], 992
-; CHECK-AVX2-NEXT:    br i1 [[TMP43]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-AVX2:       vec.epilog.middle.block:
-; CHECK-AVX2-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 1000, 992
-; CHECK-AVX2-NEXT:    br i1 [[CMP_N4]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK-AVX2:       vec.epilog.scalar.ph:
-; CHECK-AVX2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 992, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-AVX2-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-AVX2:       for.cond.cleanup.loopexit:
-; CHECK-AVX2-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK-AVX2:       for.cond.cleanup:
-; CHECK-AVX2-NEXT:    ret void
-; CHECK-AVX2:       for.body:
-; CHECK-AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT:    [[TMP44:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-AVX2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT:    [[TMP45:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; CHECK-AVX2-NEXT:    [[ADD:%.*]] = add i8 [[TMP45]], [[TMP44]]
-; CHECK-AVX2-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX6]], align 1
-; CHECK-AVX2-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT:    [[TMP46:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
-; CHECK-AVX2-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT:    [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
-; CHECK-AVX2-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP47]], [[TMP46]]
-; CHECK-AVX2-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT:    store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
-; CHECK-AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-AVX2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-;
 entry:
   br label %for.body
 
@@ -270,148 +51,7 @@ for.body:
 ; VF chosen should be at most 16 (not the max possible vector width = 32 for AVX2)
 define void @not_too_small_tc(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
 ; CHECK-LABEL: not_too_small_tc
-; CHECK-AVX1-LABEL: @not_too_small_tc(
-; CHECK-AVX1-NEXT:  iter.check:
-; CHECK-AVX1-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-AVX1:       vector.main.loop.iter.check:
-; CHECK-AVX1-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-AVX1:       vector.ph:
-; CHECK-AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-AVX1:       vector.body:
-; CHECK-AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-AVX1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-AVX1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[TMP0]]
-; CHECK-AVX1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
-; CHECK-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
-; CHECK-AVX1-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT:    [[TMP7:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-AVX1-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
-; CHECK-AVX1-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[TMP8]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-AVX1-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-AVX1-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-AVX1:       middle.block:
-; CHECK-AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
-; CHECK-AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-AVX1:       vec.epilog.iter.check:
-; CHECK-AVX1-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-AVX1:       vec.epilog.ph:
-; CHECK-AVX1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-AVX1-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK-AVX1:       vec.epilog.vector.body:
-; CHECK-AVX1-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-AVX1-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-AVX1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[TMP10]]
-; CHECK-AVX1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP11]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>*
-; CHECK-AVX1-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP10]]
-; CHECK-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i32 0
-; CHECK-AVX1-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <8 x i8>*
-; CHECK-AVX1-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP16]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT:    [[TMP17:%.*]] = add <8 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
-; CHECK-AVX1-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP15]] to <8 x i8>*
-; CHECK-AVX1-NEXT:    store <8 x i8> [[TMP17]], <8 x i8>* [[TMP18]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-AVX1-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT6]], 16
-; CHECK-AVX1-NEXT:    br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-AVX1:       vec.epilog.middle.block:
-; CHECK-AVX1-NEXT:    [[CMP_N2:%.*]] = icmp eq i64 16, 16
-; CHECK-AVX1-NEXT:    br i1 [[CMP_N2]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK-AVX1:       vec.epilog.scalar.ph:
-; CHECK-AVX1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-AVX1-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-AVX1:       for.body:
-; CHECK-AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT:    [[L1:%.*]] = load i8, i8* [[ARRAYIDX]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT:    [[L2:%.*]] = load i8, i8* [[ARRAYIDX2]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT:    [[ADD:%.*]] = add i8 [[L1]], [[L2]]
-; CHECK-AVX1-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX2]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-AVX1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; CHECK-AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-AVX1:       for.end.loopexit:
-; CHECK-AVX1-NEXT:    br label [[FOR_END]]
-; CHECK-AVX1:       for.end:
-; CHECK-AVX1-NEXT:    ret void
-;
-; CHECK-AVX2-LABEL: @not_too_small_tc(
-; CHECK-AVX2-NEXT:  iter.check:
-; CHECK-AVX2-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-AVX2:       vector.main.loop.iter.check:
-; CHECK-AVX2-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-AVX2:       vector.ph:
-; CHECK-AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-AVX2:       vector.body:
-; CHECK-AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-AVX2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-AVX2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[TMP0]]
-; CHECK-AVX2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
-; CHECK-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
-; CHECK-AVX2-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT:    [[TMP7:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-AVX2-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
-; CHECK-AVX2-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[TMP8]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-AVX2-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-AVX2-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-AVX2:       middle.block:
-; CHECK-AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
-; CHECK-AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-AVX2:       vec.epilog.iter.check:
-; CHECK-AVX2-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-AVX2:       vec.epilog.ph:
-; CHECK-AVX2-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-AVX2-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK-AVX2:       vec.epilog.vector.body:
-; CHECK-AVX2-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-AVX2-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[TMP10]]
-; CHECK-AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP11]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>*
-; CHECK-AVX2-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP10]]
-; CHECK-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i32 0
-; CHECK-AVX2-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <8 x i8>*
-; CHECK-AVX2-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP16]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT:    [[TMP17:%.*]] = add <8 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
-; CHECK-AVX2-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP15]] to <8 x i8>*
-; CHECK-AVX2-NEXT:    store <8 x i8> [[TMP17]], <8 x i8>* [[TMP18]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-AVX2-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT6]], 16
-; CHECK-AVX2-NEXT:    br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-AVX2:       vec.epilog.middle.block:
-; CHECK-AVX2-NEXT:    [[CMP_N2:%.*]] = icmp eq i64 16, 16
-; CHECK-AVX2-NEXT:    br i1 [[CMP_N2]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK-AVX2:       vec.epilog.scalar.ph:
-; CHECK-AVX2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-AVX2-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-AVX2:       for.body:
-; CHECK-AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT:    [[L1:%.*]] = load i8, i8* [[ARRAYIDX]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT:    [[L2:%.*]] = load i8, i8* [[ARRAYIDX2]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT:    [[ADD:%.*]] = add i8 [[L1]], [[L2]]
-; CHECK-AVX2-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX2]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-AVX2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; CHECK-AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-AVX2:       for.end.loopexit:
-; CHECK-AVX2-NEXT:    br label [[FOR_END]]
-; CHECK-AVX2:       for.end:
-; CHECK-AVX2-NEXT:    ret void
-;
+; CHECK-AVX2: LV: Selecting VF: 16.
 entry:
   br label %for.body
 

diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll
index bfc7ccc08100c..c38b638ed2b82 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=VECTORIZED %s
 ; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s
 ; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s
@@ -14,173 +13,6 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define i32 @foo(i32 %n) #0 !dbg !4 {
-; VECTORIZED-LABEL: @foo(
-; VECTORIZED-NEXT:  entry:
-; VECTORIZED-NEXT:    [[DIFF:%.*]] = alloca i32, align 4
-; VECTORIZED-NEXT:    [[CB:%.*]] = alloca [16 x i8], align 16
-; VECTORIZED-NEXT:    [[CC:%.*]] = alloca [16 x i8], align 16
-; VECTORIZED-NEXT:    store i32 0, i32* [[DIFF]], align 4, !tbaa [[TBAA8:![0-9]+]]
-; VECTORIZED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; VECTORIZED:       vector.ph:
-; VECTORIZED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; VECTORIZED:       vector.body:
-; VECTORIZED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VECTORIZED-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; VECTORIZED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VECTORIZED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[TMP0]]
-; VECTORIZED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; VECTORIZED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; VECTORIZED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1, !tbaa [[TBAA12:![0-9]+]]
-; VECTORIZED-NEXT:    [[TMP4:%.*]] = sext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
-; VECTORIZED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[TMP0]]
-; VECTORIZED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0
-; VECTORIZED-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
-; VECTORIZED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1, !tbaa [[TBAA12]]
-; VECTORIZED-NEXT:    [[TMP8:%.*]] = sext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
-; VECTORIZED-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
-; VECTORIZED-NEXT:    [[TMP10]] = add <4 x i32> [[TMP9]], [[VEC_PHI]]
-; VECTORIZED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VECTORIZED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VECTORIZED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; VECTORIZED:       middle.block:
-; VECTORIZED-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
-; VECTORIZED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
-; VECTORIZED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; VECTORIZED:       scalar.ph:
-; VECTORIZED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VECTORIZED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]]
-; VECTORIZED:       for.body:
-; VECTORIZED-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; VECTORIZED-NEXT:    [[ADD8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDVARS_IV]]
-; VECTORIZED-NEXT:    [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !tbaa [[TBAA12]]
-; VECTORIZED-NEXT:    [[CONV:%.*]] = sext i8 [[TMP13]] to i32
-; VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDVARS_IV]]
-; VECTORIZED-NEXT:    [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !tbaa [[TBAA12]]
-; VECTORIZED-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP14]] to i32
-; VECTORIZED-NEXT:    [[SUB:%.*]] = sub i32 [[CONV]], [[CONV3]]
-; VECTORIZED-NEXT:    [[ADD]] = add nsw i32 [[SUB]], [[ADD8]]
-; VECTORIZED-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; VECTORIZED-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; VECTORIZED-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; VECTORIZED:       for.end:
-; VECTORIZED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; VECTORIZED-NEXT:    store i32 [[ADD_LCSSA]], i32* [[DIFF]], align 4, !tbaa [[TBAA8]]
-; VECTORIZED-NEXT:    call void @ibar(i32* [[DIFF]])
-; VECTORIZED-NEXT:    ret i32 0
-;
-; UNROLLED-LABEL: @foo(
-; UNROLLED-NEXT:  entry:
-; UNROLLED-NEXT:    [[DIFF:%.*]] = alloca i32, align 4
-; UNROLLED-NEXT:    [[CB:%.*]] = alloca [16 x i8], align 16
-; UNROLLED-NEXT:    [[CC:%.*]] = alloca [16 x i8], align 16
-; UNROLLED-NEXT:    store i32 0, i32* [[DIFF]], align 4, !tbaa [[TBAA8:![0-9]+]]
-; UNROLLED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLLED:       vector.ph:
-; UNROLLED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLLED:       vector.body:
-; UNROLLED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT:    [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLLED-NEXT:    [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1
-; UNROLLED-NEXT:    [[INDUCTION5:%.*]] = add i64 [[INDEX]], 2
-; UNROLLED-NEXT:    [[INDUCTION6:%.*]] = add i64 [[INDEX]], 3
-; UNROLLED-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION]]
-; UNROLLED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION4]]
-; UNROLLED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION5]]
-; UNROLLED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION6]]
-; UNROLLED-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP0]], align 1, !tbaa [[TBAA12:![0-9]+]]
-; UNROLLED-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP1]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMP2]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP3]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT:    [[TMP8:%.*]] = sext i8 [[TMP4]] to i32
-; UNROLLED-NEXT:    [[TMP9:%.*]] = sext i8 [[TMP5]] to i32
-; UNROLLED-NEXT:    [[TMP10:%.*]] = sext i8 [[TMP6]] to i32
-; UNROLLED-NEXT:    [[TMP11:%.*]] = sext i8 [[TMP7]] to i32
-; UNROLLED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION]]
-; UNROLLED-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION4]]
-; UNROLLED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION5]]
-; UNROLLED-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION6]]
-; UNROLLED-NEXT:    [[TMP16:%.*]] = load i8, i8* [[TMP12]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT:    [[TMP17:%.*]] = load i8, i8* [[TMP13]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP14]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT:    [[TMP19:%.*]] = load i8, i8* [[TMP15]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT:    [[TMP20:%.*]] = sext i8 [[TMP16]] to i32
-; UNROLLED-NEXT:    [[TMP21:%.*]] = sext i8 [[TMP17]] to i32
-; UNROLLED-NEXT:    [[TMP22:%.*]] = sext i8 [[TMP18]] to i32
-; UNROLLED-NEXT:    [[TMP23:%.*]] = sext i8 [[TMP19]] to i32
-; UNROLLED-NEXT:    [[TMP24:%.*]] = sub i32 [[TMP8]], [[TMP20]]
-; UNROLLED-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP9]], [[TMP21]]
-; UNROLLED-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP10]], [[TMP22]]
-; UNROLLED-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP11]], [[TMP23]]
-; UNROLLED-NEXT:    [[TMP28]] = add i32 [[TMP24]], [[VEC_PHI]]
-; UNROLLED-NEXT:    [[TMP29]] = add i32 [[TMP25]], [[VEC_PHI1]]
-; UNROLLED-NEXT:    [[TMP30]] = add i32 [[TMP26]], [[VEC_PHI2]]
-; UNROLLED-NEXT:    [[TMP31]] = add i32 [[TMP27]], [[VEC_PHI3]]
-; UNROLLED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLLED-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; UNROLLED-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; UNROLLED:       middle.block:
-; UNROLLED-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP29]], [[TMP28]]
-; UNROLLED-NEXT:    [[BIN_RDX7:%.*]] = add i32 [[TMP30]], [[BIN_RDX]]
-; UNROLLED-NEXT:    [[BIN_RDX8:%.*]] = add i32 [[TMP31]], [[BIN_RDX7]]
-; UNROLLED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
-; UNROLLED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLLED:       scalar.ph:
-; UNROLLED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLLED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; UNROLLED-NEXT:    br label [[FOR_BODY:%.*]]
-; UNROLLED:       for.body:
-; UNROLLED-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; UNROLLED-NEXT:    [[ADD8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; UNROLLED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDVARS_IV]]
-; UNROLLED-NEXT:    [[TMP33:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT:    [[CONV:%.*]] = sext i8 [[TMP33]] to i32
-; UNROLLED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDVARS_IV]]
-; UNROLLED-NEXT:    [[TMP34:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP34]] to i32
-; UNROLLED-NEXT:    [[SUB:%.*]] = sub i32 [[CONV]], [[CONV3]]
-; UNROLLED-NEXT:    [[ADD]] = add nsw i32 [[SUB]], [[ADD8]]
-; UNROLLED-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; UNROLLED-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; UNROLLED-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; UNROLLED:       for.end:
-; UNROLLED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; UNROLLED-NEXT:    store i32 [[ADD_LCSSA]], i32* [[DIFF]], align 4, !tbaa [[TBAA8]]
-; UNROLLED-NEXT:    call void @ibar(i32* [[DIFF]])
-; UNROLLED-NEXT:    ret i32 0
-;
-; NONE-LABEL: @foo(
-; NONE-NEXT:  entry:
-; NONE-NEXT:    [[DIFF:%.*]] = alloca i32, align 4
-; NONE-NEXT:    [[CB:%.*]] = alloca [16 x i8], align 16
-; NONE-NEXT:    [[CC:%.*]] = alloca [16 x i8], align 16
-; NONE-NEXT:    store i32 0, i32* [[DIFF]], align 4, !tbaa [[TBAA8:![0-9]+]]
-; NONE-NEXT:    br label [[FOR_BODY:%.*]]
-; NONE:       for.body:
-; NONE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NONE-NEXT:    [[ADD8:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; NONE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDVARS_IV]]
-; NONE-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !tbaa [[TBAA12:![0-9]+]]
-; NONE-NEXT:    [[CONV:%.*]] = sext i8 [[TMP0]] to i32
-; NONE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDVARS_IV]]
-; NONE-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !tbaa [[TBAA12]]
-; NONE-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP1]] to i32
-; NONE-NEXT:    [[SUB:%.*]] = sub i32 [[CONV]], [[CONV3]]
-; NONE-NEXT:    [[ADD]] = add nsw i32 [[SUB]], [[ADD8]]
-; NONE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NONE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; NONE-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; NONE:       for.end:
-; NONE-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ]
-; NONE-NEXT:    store i32 [[ADD_LCSSA]], i32* [[DIFF]], align 4, !tbaa [[TBAA8]]
-; NONE-NEXT:    call void @ibar(i32* [[DIFF]])
-; NONE-NEXT:    ret i32 0
-;
 entry:
   %diff = alloca i32, align 4
   %cb = alloca [16 x i8], align 16

diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
index 318a860c253a6..e16f8cf4ee0c8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -pass-remarks-missed='loop-vectorize' -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
 
 ; Verify analysis remarks are generated when interleaving is not beneficial.
@@ -25,30 +24,6 @@ target triple = "x86_64-apple-macosx10.10.0"
 
 ; Function Attrs: nounwind uwtable
 define void @do_not_interleave(float** noalias nocapture readonly %in, float* noalias nocapture %out, i32 %size) #0 !dbg !4 {
-; CHECK-LABEL: @do_not_interleave(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP_4:%.*]] = icmp eq i32 [[SIZE:%.*]], 0, !dbg [[DBG8:![0-9]+]]
-; CHECK-NEXT:    br i1 [[CMP_4]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]], !dbg [[DBG9:![0-9]+]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG10:![0-9]+]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float*, float** [[IN:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG10]]
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float** [[ARRAYIDX]] to i32**, !dbg [[DBG10]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32*, i32** [[TMP0]], align 8, !dbg [[DBG10]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4, !dbg [[DBG11:![0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG12:![0-9]+]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to i32*, !dbg [[DBG13:![0-9]+]]
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP3]], align 4, !dbg [[DBG13]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG9]]
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG9]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]], !dbg [[DBG9]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG9]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]], !dbg [[DBG17:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void, !dbg [[DBG17]]
-;
 entry:
   %cmp.4 = icmp eq i32 %size, 0, !dbg !10
   br i1 %cmp.4, label %for.end, label %for.body.preheader, !dbg !11
@@ -79,30 +54,6 @@ for.end:                                          ; preds = %for.end.loopexit, %
 
 ; Function Attrs: nounwind uwtable
 define void @interleave_not_profitable(float** noalias nocapture readonly %in, float* noalias nocapture %out, i32 %size) #0 !dbg !6 {
-; CHECK-LABEL: @interleave_not_profitable(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP_4:%.*]] = icmp eq i32 [[SIZE:%.*]], 0, !dbg [[DBG19:![0-9]+]]
-; CHECK-NEXT:    br i1 [[CMP_4]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]], !dbg [[DBG20:![0-9]+]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG20]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float*, float** [[IN:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG21:![0-9]+]]
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float** [[ARRAYIDX]] to i32**, !dbg [[DBG21]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32*, i32** [[TMP0]], align 8, !dbg [[DBG21]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4, !dbg [[DBG22:![0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG23:![0-9]+]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to i32*, !dbg [[DBG24:![0-9]+]]
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP3]], align 4, !dbg [[DBG24]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG20]]
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG20]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]], !dbg [[DBG20]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG20]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]], !dbg [[DBG26:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void, !dbg [[DBG26]]
-;
 entry:
   %cmp.4 = icmp eq i32 %size, 0, !dbg !20
   br i1 %cmp.4, label %for.end, label %for.body, !dbg !21

diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
index 3138d6f6743c3..0345a3ba04577 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=VECTORIZED %s
 ; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s
 ; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s
@@ -14,173 +13,6 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define i32 @foo(i32 %n) #0 !dbg !4 {
-; VECTORIZED-LABEL: @foo(
-; VECTORIZED-NEXT:  entry:
-; VECTORIZED-NEXT:    [[DIFF:%.*]] = alloca i32, align 4
-; VECTORIZED-NEXT:    [[CB:%.*]] = alloca [16 x i8], align 16
-; VECTORIZED-NEXT:    [[CC:%.*]] = alloca [16 x i8], align 16
-; VECTORIZED-NEXT:    store i32 0, i32* [[DIFF]], align 4, !dbg [[DBG8:![0-9]+]], !tbaa [[TBAA9:![0-9]+]]
-; VECTORIZED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !dbg [[DBG13:![0-9]+]]
-; VECTORIZED:       vector.ph:
-; VECTORIZED-NEXT:    br label [[VECTOR_BODY:%.*]], !dbg [[DBG13]]
-; VECTORIZED:       vector.body:
-; VECTORIZED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], !dbg [[DBG13]]
-; VECTORIZED-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; VECTORIZED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VECTORIZED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[TMP0]], !dbg [[DBG17:![0-9]+]]
-; VECTORIZED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0, !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*, !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19:![0-9]+]]
-; VECTORIZED-NEXT:    [[TMP4:%.*]] = sext <4 x i8> [[WIDE_LOAD]] to <4 x i32>, !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[TMP0]], !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0, !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*, !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; VECTORIZED-NEXT:    [[TMP8:%.*]] = sext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>, !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]], !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[TMP10]] = add <4 x i32> [[TMP9]], [[VEC_PHI]], !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG13]]
-; VECTORIZED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16, !dbg [[DBG13]]
-; VECTORIZED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg [[DBG13]], !llvm.loop [[LOOP20:![0-9]+]]
-; VECTORIZED:       middle.block:
-; VECTORIZED-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]), !dbg [[DBG13]]
-; VECTORIZED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16, !dbg [[DBG13]]
-; VECTORIZED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]], !dbg [[DBG13]]
-; VECTORIZED:       scalar.ph:
-; VECTORIZED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VECTORIZED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; VECTORIZED-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG13]]
-; VECTORIZED:       for.body:
-; VECTORIZED-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; VECTORIZED-NEXT:    [[ADD8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ], !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDVARS_IV]], !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; VECTORIZED-NEXT:    [[CONV:%.*]] = sext i8 [[TMP13]] to i32, !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDVARS_IV]], !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; VECTORIZED-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP14]] to i32, !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[SUB:%.*]] = sub i32 [[CONV]], [[CONV3]], !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[ADD]] = add nsw i32 [[SUB]], [[ADD8]], !dbg [[DBG17]]
-; VECTORIZED-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG13]]
-; VECTORIZED-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16, !dbg [[DBG13]]
-; VECTORIZED-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !dbg [[DBG13]], !llvm.loop [[LOOP22:![0-9]+]]
-; VECTORIZED:       for.end:
-; VECTORIZED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ], !dbg [[DBG17]]
-; VECTORIZED-NEXT:    store i32 [[ADD_LCSSA]], i32* [[DIFF]], align 4, !dbg [[DBG17]], !tbaa [[TBAA9]]
-; VECTORIZED-NEXT:    call void @ibar(i32* [[DIFF]]), !dbg [[DBG24:![0-9]+]]
-; VECTORIZED-NEXT:    ret i32 0, !dbg [[DBG25:![0-9]+]]
-;
-; UNROLLED-LABEL: @foo(
-; UNROLLED-NEXT:  entry:
-; UNROLLED-NEXT:    [[DIFF:%.*]] = alloca i32, align 4
-; UNROLLED-NEXT:    [[CB:%.*]] = alloca [16 x i8], align 16
-; UNROLLED-NEXT:    [[CC:%.*]] = alloca [16 x i8], align 16
-; UNROLLED-NEXT:    store i32 0, i32* [[DIFF]], align 4, !dbg [[DBG8:![0-9]+]], !tbaa [[TBAA9:![0-9]+]]
-; UNROLLED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !dbg [[DBG13:![0-9]+]]
-; UNROLLED:       vector.ph:
-; UNROLLED-NEXT:    br label [[VECTOR_BODY:%.*]], !dbg [[DBG13]]
-; UNROLLED:       vector.body:
-; UNROLLED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], !dbg [[DBG13]]
-; UNROLLED-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT:    [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLLED-NEXT:    [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1
-; UNROLLED-NEXT:    [[INDUCTION5:%.*]] = add i64 [[INDEX]], 2
-; UNROLLED-NEXT:    [[INDUCTION6:%.*]] = add i64 [[INDEX]], 3
-; UNROLLED-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION]], !dbg [[DBG17:![0-9]+]]
-; UNROLLED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION4]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION5]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION6]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP0]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19:![0-9]+]]
-; UNROLLED-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP1]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMP2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP3]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT:    [[TMP8:%.*]] = sext i8 [[TMP4]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP9:%.*]] = sext i8 [[TMP5]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP10:%.*]] = sext i8 [[TMP6]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP11:%.*]] = sext i8 [[TMP7]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION4]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION5]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION6]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP16:%.*]] = load i8, i8* [[TMP12]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT:    [[TMP17:%.*]] = load i8, i8* [[TMP13]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP14]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT:    [[TMP19:%.*]] = load i8, i8* [[TMP15]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT:    [[TMP20:%.*]] = sext i8 [[TMP16]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP21:%.*]] = sext i8 [[TMP17]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP22:%.*]] = sext i8 [[TMP18]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP23:%.*]] = sext i8 [[TMP19]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP24:%.*]] = sub i32 [[TMP8]], [[TMP20]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP9]], [[TMP21]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP10]], [[TMP22]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP11]], [[TMP23]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP28]] = add i32 [[TMP24]], [[VEC_PHI]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP29]] = add i32 [[TMP25]], [[VEC_PHI1]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP30]] = add i32 [[TMP26]], [[VEC_PHI2]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP31]] = add i32 [[TMP27]], [[VEC_PHI3]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG13]]
-; UNROLLED-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16, !dbg [[DBG13]]
-; UNROLLED-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg [[DBG13]], !llvm.loop [[LOOP20:![0-9]+]]
-; UNROLLED:       middle.block:
-; UNROLLED-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP29]], [[TMP28]], !dbg [[DBG13]]
-; UNROLLED-NEXT:    [[BIN_RDX7:%.*]] = add i32 [[TMP30]], [[BIN_RDX]], !dbg [[DBG13]]
-; UNROLLED-NEXT:    [[BIN_RDX8:%.*]] = add i32 [[TMP31]], [[BIN_RDX7]], !dbg [[DBG13]]
-; UNROLLED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16, !dbg [[DBG13]]
-; UNROLLED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]], !dbg [[DBG13]]
-; UNROLLED:       scalar.ph:
-; UNROLLED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLLED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; UNROLLED-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG13]]
-; UNROLLED:       for.body:
-; UNROLLED-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; UNROLLED-NEXT:    [[ADD8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDVARS_IV]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP33:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT:    [[CONV:%.*]] = sext i8 [[TMP33]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDVARS_IV]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[TMP34:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP34]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[SUB:%.*]] = sub i32 [[CONV]], [[CONV3]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[ADD]] = add nsw i32 [[SUB]], [[ADD8]], !dbg [[DBG17]]
-; UNROLLED-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG13]]
-; UNROLLED-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16, !dbg [[DBG13]]
-; UNROLLED-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !dbg [[DBG13]], !llvm.loop [[LOOP22:![0-9]+]]
-; UNROLLED:       for.end:
-; UNROLLED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ], !dbg [[DBG17]]
-; UNROLLED-NEXT:    store i32 [[ADD_LCSSA]], i32* [[DIFF]], align 4, !dbg [[DBG17]], !tbaa [[TBAA9]]
-; UNROLLED-NEXT:    call void @ibar(i32* [[DIFF]]), !dbg [[DBG23:![0-9]+]]
-; UNROLLED-NEXT:    ret i32 0, !dbg [[DBG24:![0-9]+]]
-;
-; NONE-LABEL: @foo(
-; NONE-NEXT:  entry:
-; NONE-NEXT:    [[DIFF:%.*]] = alloca i32, align 4
-; NONE-NEXT:    [[CB:%.*]] = alloca [16 x i8], align 16
-; NONE-NEXT:    [[CC:%.*]] = alloca [16 x i8], align 16
-; NONE-NEXT:    store i32 0, i32* [[DIFF]], align 4, !dbg [[DBG8:![0-9]+]], !tbaa [[TBAA9:![0-9]+]]
-; NONE-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG13:![0-9]+]]
-; NONE:       for.body:
-; NONE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NONE-NEXT:    [[ADD8:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ], !dbg [[DBG17:![0-9]+]]
-; NONE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDVARS_IV]], !dbg [[DBG17]]
-; NONE-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19:![0-9]+]]
-; NONE-NEXT:    [[CONV:%.*]] = sext i8 [[TMP0]] to i32, !dbg [[DBG17]]
-; NONE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDVARS_IV]], !dbg [[DBG17]]
-; NONE-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; NONE-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP1]] to i32, !dbg [[DBG17]]
-; NONE-NEXT:    [[SUB:%.*]] = sub i32 [[CONV]], [[CONV3]], !dbg [[DBG17]]
-; NONE-NEXT:    [[ADD]] = add nsw i32 [[SUB]], [[ADD8]], !dbg [[DBG17]]
-; NONE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG13]]
-; NONE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16, !dbg [[DBG13]]
-; NONE-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !dbg [[DBG13]]
-; NONE:       for.end:
-; NONE-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], !dbg [[DBG17]]
-; NONE-NEXT:    store i32 [[ADD_LCSSA]], i32* [[DIFF]], align 4, !dbg [[DBG17]], !tbaa [[TBAA9]]
-; NONE-NEXT:    call void @ibar(i32* [[DIFF]]), !dbg [[DBG20:![0-9]+]]
-; NONE-NEXT:    ret i32 0, !dbg [[DBG21:![0-9]+]]
-;
 entry:
   %diff = alloca i32, align 4
   %cb = alloca [16 x i8], align 16

diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll
index f4233ee865b61..10b82d1469636 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll
@@ -1,33 +1,11 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -basic-aa -loop-vectorize < %s | FileCheck %s
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.11.0"
 
 define i32 @accum(i32* nocapture readonly %x, i32 %N) #0 {
-; CHECK-LABEL: @accum(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_INC_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.inc.preheader:
-; CHECK-NEXT:    br label [[FOR_INC:%.*]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[FOR_INC_PREHEADER]] ]
-; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_INC]] ], [ 0, [[FOR_INC_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP0]], [[SUM_02]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_INC]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INC]] ]
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
-;
 entry:
+; CHECK-LABEL: @accum
+; CHECK-NOT: x i32>
 
   %cmp1 = icmp sgt i32 %N, 0
   br i1 %cmp1, label %for.inc.preheader, label %for.end
@@ -54,6 +32,7 @@ for.end:
   %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ]
   ret i32 %sum.0.lcssa
 
+; CHECK: ret i32
 }
 
 attributes #0 = { "target-cpu"="core2" "target-features"="+sse,-avx,-avx2,-sse2" }

diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll
index 7e04c63d02769..80c7eecdeeda3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -enable-interleaved-mem-accesses=true -force-vector-width=4 -loop-vectorize -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -8,78 +7,6 @@ target triple = "x86_64-apple-macosx10.7.0"
 ;
 ; CHECK-NOT: load {{.*}} x x86_fp80
 define x86_fp80 @foo(x86_fp80* %a) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x x86_fp80> [ <x86_fp80 undef, x86_fp80 0xK00000000000000000000, x86_fp80 0xK00000000000000000000, x86_fp80 0xK00000000000000000000>, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i16 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = add i16 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add i16 [[OFFSET_IDX]], 6
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A:%.*]], i16 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load x86_fp80, x86_fp80* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = load x86_fp80, x86_fp80* [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = load x86_fp80, x86_fp80* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP12:%.*]] = load x86_fp80, x86_fp80* [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x x86_fp80> poison, x86_fp80 [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x x86_fp80> [[TMP13]], x86_fp80 [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x x86_fp80> [[TMP14]], x86_fp80 [[TMP11]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x x86_fp80> [[TMP15]], x86_fp80 [[TMP12]], i32 3
-; CHECK-NEXT:    [[TMP17:%.*]] = or i16 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP18:%.*]] = or i16 [[TMP2]], 1
-; CHECK-NEXT:    [[TMP19:%.*]] = or i16 [[TMP3]], 1
-; CHECK-NEXT:    [[TMP20:%.*]] = or i16 [[TMP4]], 1
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP17]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP19]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP20]]
-; CHECK-NEXT:    [[TMP25:%.*]] = load x86_fp80, x86_fp80* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP26:%.*]] = load x86_fp80, x86_fp80* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP27:%.*]] = load x86_fp80, x86_fp80* [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP28:%.*]] = load x86_fp80, x86_fp80* [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x x86_fp80> poison, x86_fp80 [[TMP25]], i32 0
-; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x x86_fp80> [[TMP29]], x86_fp80 [[TMP26]], i32 1
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x x86_fp80> [[TMP30]], x86_fp80 [[TMP27]], i32 2
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x x86_fp80> [[TMP31]], x86_fp80 [[TMP28]], i32 3
-; CHECK-NEXT:    [[TMP33:%.*]] = fadd fast <4 x x86_fp80> [[TMP16]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP34]] = fadd fast <4 x x86_fp80> [[TMP33]], [[TMP32]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[INDEX_NEXT]], 200
-; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP36:%.*]] = call fast x86_fp80 @llvm.vector.reduce.fadd.v4f80(x86_fp80 0xK80000000000000000000, <4 x x86_fp80> [[TMP34]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 200, 200
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 400, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi x86_fp80 [ undef, [[ENTRY]] ], [ [[TMP36]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi x86_fp80 [ [[TMP40:%.*]], [[FOR_BODY]] ], [ [[TMP36]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret x86_fp80 [[DOTLCSSA]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_09:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[RES_08:%.*]] = phi x86_fp80 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP40]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[I_09]]
-; CHECK-NEXT:    [[TMP37:%.*]] = load x86_fp80, x86_fp80* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = or i16 [[I_09]], 1
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[ADD]]
-; CHECK-NEXT:    [[TMP38:%.*]] = load x86_fp80, x86_fp80* [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[TMP39:%.*]] = fadd fast x86_fp80 [[TMP37]], [[RES_08]]
-; CHECK-NEXT:    [[TMP40]] = fadd fast x86_fp80 [[TMP39]], [[TMP38]]
-; CHECK-NEXT:    [[ADD3]] = add nuw nsw i16 [[I_09]], 2
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i16 [[ADD3]], 400
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP2:![0-9]+]]
-;
 entry:
   br label %for.body
 

diff --git a/llvm/test/Transforms/LoopVectorize/alias-set-with-uncomputable-bounds.ll b/llvm/test/Transforms/LoopVectorize/alias-set-with-uncomputable-bounds.ll
index 4aa672c71bf96..5f48283f52a43 100644
--- a/llvm/test/Transforms/LoopVectorize/alias-set-with-uncomputable-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/alias-set-with-uncomputable-bounds.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt  -loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
 
 ; Tests with alias sets that contain points with uncomputable bounds because
@@ -7,82 +6,9 @@
 ; Alias set with uncomputable bounds contains a single load. We do not need
 ; runtime checks for that group and it should not block vectorization.
 define void @test1_uncomputable_bounds_single_load(i32* noalias %ptr.1, i32* noalias %ptr.2, i32* noalias %ptr.3, i64 %N, i64 %X) {
-; CHECK-LABEL: @test1_uncomputable_bounds_single_load(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[PTR_11:%.*]] = bitcast i32* [[PTR_1:%.*]] to i8*
-; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[COND]], label [[PH:%.*]], label [[EXIT:%.*]]
-; CHECK:       ph:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[PTR_1]], i64 [[N]]
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[PTR_1]], i64 [[X:%.*]]
-; CHECK-NEXT:    [[SCEVGEP34:%.*]] = bitcast i32* [[SCEVGEP3]] to i8*
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[X]], [[N]]
-; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[PTR_1]], i64 [[TMP0]]
-; CHECK-NEXT:    [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[PTR_11]], [[SCEVGEP56]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PTR_3:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR_2:%.*]], i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR_2]], i32 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP12]], <2 x i32>* [[TMP15]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT:    [[TMP16:%.*]] = add nsw i64 [[TMP1]], [[X]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP12]], <2 x i32>* [[TMP19]], align 4, !alias.scope !3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PH]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR_3]], i64 [[IV]]
-; CHECK-NEXT:    [[OFFSET_1:%.*]] = load i32, i32* [[GEP_1]], align 4
-; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR_2]], i32 [[OFFSET_1]]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[GEP_2]], align 4
-; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[LV]], i32* [[GEP_3]], align 4
-; CHECK-NEXT:    [[OFFSET_2:%.*]] = add nsw i64 [[IV]], [[X]]
-; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[OFFSET_2]]
-; CHECK-NEXT:    store i32 [[LV]], i32* [[GEP_4]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       loop.exit:
-; CHECK-NEXT:    br label [[EXIT]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: define void @test1_uncomputable_bounds_single_load
+; CHECK:       vector.body
+; CHECK:         ret void
 
 entry:
   %cond = icmp sgt i64 %N, 0
@@ -116,80 +42,9 @@ exit:
 ; Alias set with uncomputable bounds contains a single store. We do not need
 ; runtime checks for that group and it should not block vectorization.
 define void @test2_uncomputable_bounds_single_store(i32* noalias %ptr.1, i32* noalias %ptr.2, i32* noalias %ptr.3, i64 %N, i64 %X) {
-; CHECK-LABEL: @test2_uncomputable_bounds_single_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[PTR_11:%.*]] = bitcast i32* [[PTR_1:%.*]] to i8*
-; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[COND]], label [[PH:%.*]], label [[EXIT:%.*]]
-; CHECK:       ph:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[PTR_1]], i64 [[N]]
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[PTR_1]], i64 [[X:%.*]]
-; CHECK-NEXT:    [[SCEVGEP34:%.*]] = bitcast i32* [[SCEVGEP3]] to i8*
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[X]], [[N]]
-; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[PTR_1]], i64 [[TMP0]]
-; CHECK-NEXT:    [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[PTR_11]], [[SCEVGEP56]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PTR_3:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR_2:%.*]], i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR_2]], i32 [[TMP7]]
-; CHECK-NEXT:    store i32 20, i32* [[TMP6]], align 4
-; CHECK-NEXT:    store i32 20, i32* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> zeroinitializer, <2 x i32>* [[TMP11]], align 4, !alias.scope !8, !noalias !11
-; CHECK-NEXT:    [[TMP12:%.*]] = add nsw i64 [[TMP1]], [[X]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> <i32 10, i32 10>, <2 x i32>* [[TMP15]], align 4, !alias.scope !11
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PH]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR_3]], i64 [[IV]]
-; CHECK-NEXT:    [[OFFSET_1:%.*]] = load i32, i32* [[GEP_1]], align 4
-; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR_2]], i32 [[OFFSET_1]]
-; CHECK-NEXT:    store i32 20, i32* [[GEP_2]], align 4
-; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[IV]]
-; CHECK-NEXT:    store i32 0, i32* [[GEP_3]], align 4
-; CHECK-NEXT:    [[OFFSET_2:%.*]] = add nsw i64 [[IV]], [[X]]
-; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[OFFSET_2]]
-; CHECK-NEXT:    store i32 10, i32* [[GEP_4]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       loop.exit:
-; CHECK-NEXT:    br label [[EXIT]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: define void @test2_uncomputable_bounds_single_store
+; CHECK:       vector.body
+; CHECK:         ret void
 
 entry:
   %cond = icmp sgt i64 %N, 0
@@ -223,33 +78,8 @@ exit:
 ; Alias set with uncomputable bounds contains a load and a store. This blocks
 ; vectorization, as we cannot generate runtime-checks for the set.
 define void @test3_uncomputable_bounds_load_store(i32* noalias %ptr.1, i32* noalias %ptr.2, i32* noalias %ptr.3, i64 %N, i64 %X) {
-; CHECK-LABEL: @test3_uncomputable_bounds_load_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[COND]], label [[PH:%.*]], label [[EXIT:%.*]]
-; CHECK:       ph:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR_3:%.*]], i64 [[IV]]
-; CHECK-NEXT:    [[OFFSET_1:%.*]] = load i32, i32* [[GEP_1]], align 4
-; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR_2:%.*]], i32 [[OFFSET_1]]
-; CHECK-NEXT:    store i32 20, i32* [[GEP_2]], align 4
-; CHECK-NEXT:    [[GEP_22:%.*]] = getelementptr inbounds i32, i32* [[PTR_2]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[GEP_22]], align 4
-; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR_1:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[LV]], i32* [[GEP_3]], align 4
-; CHECK-NEXT:    [[OFFSET_2:%.*]] = add nsw i64 [[IV]], [[X:%.*]]
-; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[OFFSET_2]]
-; CHECK-NEXT:    store i32 [[LV]], i32* [[GEP_4]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP_EXIT:%.*]], label [[LOOP]]
-; CHECK:       loop.exit:
-; CHECK-NEXT:    br label [[EXIT]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: define void @test3_uncomputable_bounds_load_store
+; CHECK-NOT: vector.body
 
 entry:
   %cond = icmp sgt i64 %N, 0
@@ -285,33 +115,8 @@ exit:
 ; Alias set with uncomputable bounds contains a load and a store. This blocks
 ; vectorization, as we cannot generate runtime-checks for the set.
 define void @test4_uncomputable_bounds_store_store(i32* noalias %ptr.1, i32* noalias %ptr.2, i32* noalias %ptr.3, i64 %N, i64 %X) {
-; CHECK-LABEL: @test4_uncomputable_bounds_store_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[COND]], label [[PH:%.*]], label [[EXIT:%.*]]
-; CHECK:       ph:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR_3:%.*]], i64 [[IV]]
-; CHECK-NEXT:    [[OFFSET_1:%.*]] = load i32, i32* [[GEP_1]], align 4
-; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR_2:%.*]], i32 [[OFFSET_1]]
-; CHECK-NEXT:    store i32 20, i32* [[GEP_2]], align 4
-; CHECK-NEXT:    [[GEP_22:%.*]] = getelementptr inbounds i32, i32* [[PTR_2]], i64 [[IV]]
-; CHECK-NEXT:    store i32 30, i32* [[GEP_22]], align 4
-; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR_1:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store i32 0, i32* [[GEP_3]], align 4
-; CHECK-NEXT:    [[OFFSET_2:%.*]] = add nsw i64 [[IV]], [[X:%.*]]
-; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[OFFSET_2]]
-; CHECK-NEXT:    store i32 10, i32* [[GEP_4]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP_EXIT:%.*]], label [[LOOP]]
-; CHECK:       loop.exit:
-; CHECK-NEXT:    br label [[EXIT]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: define void @test4_uncomputable_bounds_store_store
+; CHECK-NOT: vector.body
 
 entry:
   %cond = icmp sgt i64 %N, 0

diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/assume.ll
index bab8c39b1a6e9..b1cb79efa2245 100644
--- a/llvm/test/Transforms/LoopVectorize/assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/assume.ll
@@ -1,68 +1,20 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s  -loop-vectorize -force-vector-width=2 -force-vector-interleave=2  -S | FileCheck %s
 
 define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD]], <float 1.000000e+02, float 1.000000e+02>
-; CHECK-NEXT:    [[TMP9:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD1]], <float 1.000000e+02, float 1.000000e+02>
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP10]])
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP12]])
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP13]])
-; CHECK-NEXT:    [[TMP14:%.*]] = fadd <2 x float> [[WIDE_LOAD]], <float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP15:%.*]] = fadd <2 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP14]], <2 x float>* [[TMP19]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 2
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP20]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP15]], <2 x float>* [[TMP21]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1600, 1600
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1600, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP23:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP23]], 1.000000e+02
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP1]])
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP23]], 1.000000e+00
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK:         [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* {{.*}}, align 4
+; CHECK:         [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* {{.*}}, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD]], <float 1.000000e+02, float 1.000000e+02>
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD1]], <float 1.000000e+02, float 1.000000e+02>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP6]])
 entry:
   br label %for.body
 
@@ -91,81 +43,15 @@ attributes #0 = { nounwind willreturn }
 
 define void @test2(%struct.data* nocapture readonly %d) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_DATA:%.*]], %struct.data* [[D:%.*]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float*, float** [[B]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to i8*
-; CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint float* [[TMP0]] to i64
-; CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31
-; CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-; CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_DATA]], %struct.data* [[D]], i64 0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = load float*, float** [[A]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to i8*
-; CHECK-NEXT:    [[PTRINT2:%.*]] = ptrtoint float* [[TMP2]] to i64
-; CHECK-NEXT:    [[MASKEDPTR3:%.*]] = and i64 [[PTRINT2]], 31
-; CHECK-NEXT:    [[MASKCOND4:%.*]] = icmp eq i64 [[MASKEDPTR3]], 0
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[TMP2]], i64 1600
-; CHECK-NEXT:    [[SCEVGEP1:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr float, float* [[TMP0]], i64 1600
-; CHECK-NEXT:    [[SCEVGEP23:%.*]] = bitcast float* [[SCEVGEP2]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[TMP3]], [[SCEVGEP23]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TMP1]], [[SCEVGEP1]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       entry:
+; CHECK:         [[MASKCOND:%.*]] = icmp eq i64 %maskedptr, 0
+; CHECK:         [[MASKCOND4:%.*]] = icmp eq i64 %maskedptr3, 0
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK:         tail call void @llvm.assume(i1 [[MASKCOND]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4, !alias.scope !4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <2 x float>, <2 x float>* [[TMP11]], align 4, !alias.scope !4
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[WIDE_LOAD]], <float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x float> [[WIDE_LOAD4]], <float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
+; CHECK:         tail call void @llvm.assume(i1 [[MASKCOND4]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP12]], <2 x float>* [[TMP17]], align 4, !alias.scope !7, !noalias !4
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 2
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP13]], <2 x float>* [[TMP19]], align 4, !alias.scope !7, !noalias !4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1600, 1600
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1600, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP21]], 1.000000e+00
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
   %0 = load float*, float** %b, align 8
@@ -203,82 +89,9 @@ for.end:                                          ; preds = %for.body
 define void @predicated_assume(float* noalias nocapture readonly %a, float* noalias nocapture %b, i32 %n) {
 ; Check that the vector.body does not contain any assumes.
 ; CHECK-LABEL: @predicated_assume(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <2 x i64> [[VEC_IND]], <i64 495616, i64 495616>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <2 x i64> [[STEP_ADD]], <i64 495616, i64 495616>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult <2 x i64> [[VEC_IND]], <i64 991232, i64 991232>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult <2 x i64> [[STEP_ADD]], <i64 991232, i64 991232>
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x float> <float 2.300000e+01, float 2.300000e+01>, <2 x float> <float 4.200000e+01, float 4.200000e+01>
-; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <2 x i1> [[TMP4]], <2 x float> <float 2.300000e+01, float 2.300000e+01>, <2 x float> <float 4.200000e+01, float 4.200000e+01>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast float* [[TMP11]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 2
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast float* [[TMP13]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <2 x float>, <2 x float>* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul <2 x float> [[PREDPHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul <2 x float> [[PREDPHI2]], [[WIDE_LOAD3]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = bitcast float* [[TMP19]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP15]], <2 x float>* [[TMP20]], align 4
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 2
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP21]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP16]], <2 x float>* [[TMP22]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
+; CHECK-NOT:     llvm.assume
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END5:%.*]] ]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i64 [[INDVARS_IV]], 495616
-; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_END5]], label [[IF_ELSE:%.*]]
-; CHECK:       if.else:
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 991232
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP2]])
-; CHECK-NEXT:    br label [[IF_END5]]
-; CHECK:       if.end5:
-; CHECK-NEXT:    [[X_0:%.*]] = phi float [ 4.200000e+01, [[IF_ELSE]] ], [ 2.300000e+01, [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[X_0]], [[TMP24]]
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[MUL]], float* [[ARRAYIDX7]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-;
 entry:
   %cmp15 = icmp eq i32 %n, 0
   br i1 %cmp15, label %for.cond.cleanup, label %for.body.preheader

diff --git a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
index 01feb8e92065e..50b64d86c2303 100644
--- a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
@@ -11,140 +11,14 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; Function Attrs: nofree norecurse nounwind uwtable
 define dso_local void @_Z3foov() local_unnamed_addr #0 {
 ; CHECK-LABEL: @_Z3foov(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !tbaa [[TBAA2:![0-9]+]]
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[MUL]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
-;
-; CHECK-MASKED-LABEL: @_Z3foov(
-; CHECK-MASKED-NEXT:  entry:
-; CHECK-MASKED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-MASKED:       vector.ph:
-; CHECK-MASKED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-MASKED:       vector.body:
-; CHECK-MASKED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MASKED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MASKED-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT:    [[STEP_ADD1:%.*]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT:    [[STEP_ADD2:%.*]] = add <4 x i32> [[STEP_ADD1]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MASKED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-MASKED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
-; CHECK-MASKED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
-; CHECK-MASKED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP0]]
-; CHECK-MASKED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP1]]
-; CHECK-MASKED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP2]]
-; CHECK-MASKED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP3]]
-; CHECK-MASKED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-MASKED-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !tbaa [[TBAA2:![0-9]+]]
-; CHECK-MASKED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4
-; CHECK-MASKED-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; CHECK-MASKED-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12
-; CHECK-MASKED-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP16:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-MASKED-NEXT:    [[TMP17:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD4]], [[STEP_ADD]]
-; CHECK-MASKED-NEXT:    [[TMP18:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD5]], [[STEP_ADD1]]
-; CHECK-MASKED-NEXT:    [[TMP19:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD6]], [[STEP_ADD2]]
-; CHECK-MASKED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP0]]
-; CHECK-MASKED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP1]]
-; CHECK-MASKED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP2]]
-; CHECK-MASKED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP3]]
-; CHECK-MASKED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-MASKED-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP25]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4
-; CHECK-MASKED-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8
-; CHECK-MASKED-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12
-; CHECK-MASKED-NEXT:    [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP31]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD7]], [[TMP16]]
-; CHECK-MASKED-NEXT:    [[TMP33:%.*]] = add nsw <4 x i32> [[WIDE_LOAD8]], [[TMP17]]
-; CHECK-MASKED-NEXT:    [[TMP34:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], [[TMP18]]
-; CHECK-MASKED-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP19]]
-; CHECK-MASKED-NEXT:    [[TMP36:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    store <4 x i32> [[TMP32]], <4 x i32>* [[TMP36]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    store <4 x i32> [[TMP33]], <4 x i32>* [[TMP37]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP38:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    store <4 x i32> [[TMP34]], <4 x i32>* [[TMP38]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP39]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-MASKED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-MASKED-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-MASKED:       middle.block:
-; CHECK-MASKED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-MASKED-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-MASKED:       scalar.ph:
-; CHECK-MASKED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-MASKED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-MASKED:       for.cond.cleanup:
-; CHECK-MASKED-NEXT:    ret void
-; CHECK-MASKED:       for.body:
-; CHECK-MASKED-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-MASKED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-MASKED-NEXT:    [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP42:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-MASKED-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
-; CHECK-MASKED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-MASKED-NEXT:    [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP43]], [[MUL]]
-; CHECK-MASKED-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-MASKED-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-MASKED-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:  [[VECTOR_BODY:vector\.body]]:
+; CHECK:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
+; CHECK:  [[FOR_BODY:for\.body]]:
+; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
+; CHECK-MASKED:  [[VECTOR_BODY:vector\.body]]:
+; CHECK-MASKED:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
+; CHECK-MASKED:  [[FOR_BODY:for\.body]]:
+; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
 ;
 entry:
   br label %for.body
@@ -171,140 +45,14 @@ for.body:                                         ; preds = %for.body, %entry
 ; Function Attrs: nofree norecurse nounwind uwtable
 define dso_local void @_Z3foo2v() local_unnamed_addr #0 {
 ; CHECK-LABEL: @_Z3foo2v(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF6]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1027, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[MUL]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1027
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF13:![0-9]+]], !llvm.loop [[LOOP14:![0-9]+]]
-;
-; CHECK-MASKED-LABEL: @_Z3foo2v(
-; CHECK-MASKED-NEXT:  entry:
-; CHECK-MASKED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-MASKED:       vector.ph:
-; CHECK-MASKED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-MASKED:       vector.body:
-; CHECK-MASKED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MASKED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MASKED-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT:    [[STEP_ADD1:%.*]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT:    [[STEP_ADD2:%.*]] = add <4 x i32> [[STEP_ADD1]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MASKED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-MASKED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
-; CHECK-MASKED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
-; CHECK-MASKED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP0]]
-; CHECK-MASKED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP1]]
-; CHECK-MASKED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP2]]
-; CHECK-MASKED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP3]]
-; CHECK-MASKED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-MASKED-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4
-; CHECK-MASKED-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; CHECK-MASKED-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12
-; CHECK-MASKED-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP16:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-MASKED-NEXT:    [[TMP17:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD4]], [[STEP_ADD]]
-; CHECK-MASKED-NEXT:    [[TMP18:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD5]], [[STEP_ADD1]]
-; CHECK-MASKED-NEXT:    [[TMP19:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD6]], [[STEP_ADD2]]
-; CHECK-MASKED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP0]]
-; CHECK-MASKED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP1]]
-; CHECK-MASKED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP2]]
-; CHECK-MASKED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP3]]
-; CHECK-MASKED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-MASKED-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP25]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4
-; CHECK-MASKED-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8
-; CHECK-MASKED-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12
-; CHECK-MASKED-NEXT:    [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP31]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD7]], [[TMP16]]
-; CHECK-MASKED-NEXT:    [[TMP33:%.*]] = add nsw <4 x i32> [[WIDE_LOAD8]], [[TMP17]]
-; CHECK-MASKED-NEXT:    [[TMP34:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], [[TMP18]]
-; CHECK-MASKED-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP19]]
-; CHECK-MASKED-NEXT:    [[TMP36:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    store <4 x i32> [[TMP32]], <4 x i32>* [[TMP36]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    store <4 x i32> [[TMP33]], <4 x i32>* [[TMP37]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP38:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    store <4 x i32> [[TMP34]], <4 x i32>* [[TMP38]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>*
-; CHECK-MASKED-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP39]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-MASKED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-MASKED-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF6]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-MASKED:       middle.block:
-; CHECK-MASKED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1027, 1024
-; CHECK-MASKED-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-MASKED:       scalar.ph:
-; CHECK-MASKED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-MASKED-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-MASKED:       for.cond.cleanup:
-; CHECK-MASKED-NEXT:    ret void
-; CHECK-MASKED:       for.body:
-; CHECK-MASKED-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-MASKED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-MASKED-NEXT:    [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[TMP42:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-MASKED-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
-; CHECK-MASKED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-MASKED-NEXT:    [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP43]], [[MUL]]
-; CHECK-MASKED-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-MASKED-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1027
-; CHECK-MASKED-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF13:![0-9]+]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:  [[VECTOR_BODY:vector\.body]]:
+; CHECK:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
+; CHECK:  [[FOR_BODY:for\.body]]:
+; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
+; CHECK-MASKED:  [[VECTOR_BODY:vector\.body]]:
+; CHECK-MASKED:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
+; CHECK-MASKED:  [[FOR_BODY:for\.body]]:
+; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
 ;
 entry:
   br label %for.body
@@ -332,6 +80,11 @@ attributes #0 = { "use-soft-float"="false" }
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
 
+; CHECK: [[LP1_255]] = !{!"branch_weights", i32 1, i32 255}
+; CHECK: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-MASKED: [[LP1_63]] = !{!"branch_weights", i32 1, i32 63}
+; CHECK-MASKED: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK: [[LP1_2]] = !{!"branch_weights", i32 1, i32 2}
 
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project c292b5b5e059e6ce3e6449e6827ef7e1037c21c4)"}

diff --git a/llvm/test/Transforms/LoopVectorize/conditional-assignment.ll b/llvm/test/Transforms/LoopVectorize/conditional-assignment.ll
index d28ab183b0f78..be6de28b83f19 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-assignment.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -enable-cond-stores-vec=false -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
 ; RUN: opt < %s -enable-cond-stores-vec=false -passes=loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
 
@@ -8,25 +7,6 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind ssp uwtable
 define void @conditional_store(i32* noalias nocapture %indices) #0 !dbg !4 {
-; CHECK-LABEL: @conditional_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG8:![0-9]+]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[INDICES:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG10:![0-9]+]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !dbg [[DBG10]], !tbaa [[TBAA12:![0-9]+]]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 1024, !dbg [[DBG10]]
-; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]], !dbg [[DBG10]]
-; CHECK:       if.then:
-; CHECK-NEXT:    store i32 0, i32* [[ARRAYIDX]], align 4, !dbg [[DBG16:![0-9]+]], !tbaa [[TBAA12]]
-; CHECK-NEXT:    br label [[FOR_INC]], !dbg [[DBG16]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG8]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096, !dbg [[DBG8]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !dbg [[DBG8]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void, !dbg [[DBG17:![0-9]+]]
-;
 entry:
   br label %for.body, !dbg !10
 

diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
index e21721272df56..9bd6b9b56eb0d 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness < %s 2>&1 | FileCheck %s
 ; RUN: opt -S -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness < %s 2>&1 | FileCheck %s
 
@@ -36,37 +35,6 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: norecurse nounwind ssp uwtable
 define void @cold(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !7 !prof !56 {
-; CHECK-LABEL: @cold(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG10:![0-9]+]]
-; CHECK-NEXT:    br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG11:![0-9]+]], !prof [[PROF12:![0-9]+]]
-; CHECK:       ph:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG13:![0-9]+]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG13]], !tbaa [[TBAA14:![0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG17:![0-9]+]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG18:![0-9]+]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG11]]
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG19:![0-9]+]]
-; CHECK-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG20:![0-9]+]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG21:![0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG21]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG22:![0-9]+]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG22]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG23:![0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG24:![0-9]+]]
-; CHECK-NEXT:    store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG25:![0-9]+]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG11]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG11]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG11]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]], !dbg [[DBG28:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void, !dbg [[DBG28]]
-;
 entry:
   %cmp28 = icmp sgt i32 %N, 0, !dbg !9
   br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !10, !prof !58
@@ -101,37 +69,6 @@ for.cond.cleanup:
 
 ; Function Attrs: norecurse nounwind ssp uwtable
 define void @hot(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !26 !prof !57 {
-; CHECK-LABEL: @hot(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG31:![0-9]+]]
-; CHECK-NEXT:    br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG32:![0-9]+]], !prof [[PROF12]]
-; CHECK:       ph:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG33:![0-9]+]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG33]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG34:![0-9]+]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG34]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG35:![0-9]+]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG32]]
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG36:![0-9]+]]
-; CHECK-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG38:![0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG38]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG39:![0-9]+]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG39]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG40:![0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG41:![0-9]+]]
-; CHECK-NEXT:    store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG42:![0-9]+]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG32]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG32]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG32]], !prof [[PROF26]], !llvm.loop [[LOOP43:![0-9]+]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]], !dbg [[DBG44:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void, !dbg [[DBG44]]
-;
 entry:
   %cmp28 = icmp sgt i32 %N, 0, !dbg !27
   br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !28, !prof !58
@@ -166,37 +103,6 @@ for.cond.cleanup:
 
 ; Function Attrs: norecurse nounwind ssp uwtable
 define void @unknown(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !41 {
-; CHECK-LABEL: @unknown(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG46:![0-9]+]]
-; CHECK-NEXT:    br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG47:![0-9]+]]
-; CHECK:       ph:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG48:![0-9]+]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG48]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG49:![0-9]+]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG49]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG50:![0-9]+]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG47]]
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG51:![0-9]+]]
-; CHECK-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG52:![0-9]+]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG53:![0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG53]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG54:![0-9]+]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG54]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG55:![0-9]+]]
-; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG56:![0-9]+]]
-; CHECK-NEXT:    store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG57:![0-9]+]], !tbaa [[TBAA14]]
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG47]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG47]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG47]], !llvm.loop [[LOOP58:![0-9]+]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]], !dbg [[DBG59:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void, !dbg [[DBG59]]
-;
 entry:
   %cmp28 = icmp sgt i32 %N, 0, !dbg !42
   br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !43

diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
index 66771130db6a6..9e37121489bec 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -pass-remarks-missed=loop-vectorize \
 ; RUN:     -pass-remarks-with-hotness < %s 2>&1 | \
 ; RUN:     FileCheck -check-prefix=HOTNESS -check-prefix=BOTH %s
@@ -49,68 +48,6 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: norecurse nounwind ssp uwtable
 define void @cold(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !7 !prof !56 {
-; HOTNESS-LABEL: @cold(
-; HOTNESS-NEXT:  entry:
-; HOTNESS-NEXT:    [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG10:![0-9]+]]
-; HOTNESS-NEXT:    br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG11:![0-9]+]], !prof [[PROF12:![0-9]+]]
-; HOTNESS:       ph:
-; HOTNESS-NEXT:    br label [[FOR_BODY:%.*]]
-; HOTNESS:       for.body:
-; HOTNESS-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; HOTNESS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG13:![0-9]+]]
-; HOTNESS-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG13]], !tbaa [[TBAA14:![0-9]+]]
-; HOTNESS-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG17:![0-9]+]]
-; HOTNESS-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG18:![0-9]+]]
-; HOTNESS-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG11]]
-; HOTNESS-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG19:![0-9]+]]
-; HOTNESS-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG20:![0-9]+]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG21:![0-9]+]]
-; HOTNESS-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG21]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG22:![0-9]+]]
-; HOTNESS-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG22]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG23:![0-9]+]]
-; HOTNESS-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG24:![0-9]+]]
-; HOTNESS-NEXT:    store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG25:![0-9]+]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG11]]
-; HOTNESS-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG11]]
-; HOTNESS-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG11]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]]
-; HOTNESS:       for.cond.cleanup.loopexit:
-; HOTNESS-NEXT:    br label [[FOR_COND_CLEANUP]], !dbg [[DBG28:![0-9]+]]
-; HOTNESS:       for.cond.cleanup:
-; HOTNESS-NEXT:    ret void, !dbg [[DBG28]]
-;
-; NO_HOTNESS-LABEL: @cold(
-; NO_HOTNESS-NEXT:  entry:
-; NO_HOTNESS-NEXT:    [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG10:![0-9]+]]
-; NO_HOTNESS-NEXT:    br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG11:![0-9]+]], !prof [[PROF12:![0-9]+]]
-; NO_HOTNESS:       ph:
-; NO_HOTNESS-NEXT:    br label [[FOR_BODY:%.*]]
-; NO_HOTNESS:       for.body:
-; NO_HOTNESS-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG13:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG13]], !tbaa [[TBAA14:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG17:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG18:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG11]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG19:![0-9]+]]
-; NO_HOTNESS-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG20:![0-9]+]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG21:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG21]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG22:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG22]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG23:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG24:![0-9]+]]
-; NO_HOTNESS-NEXT:    store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG25:![0-9]+]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG11]]
-; NO_HOTNESS-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG11]]
-; NO_HOTNESS-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG11]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]]
-; NO_HOTNESS:       for.cond.cleanup.loopexit:
-; NO_HOTNESS-NEXT:    br label [[FOR_COND_CLEANUP]], !dbg [[DBG28:![0-9]+]]
-; NO_HOTNESS:       for.cond.cleanup:
-; NO_HOTNESS-NEXT:    ret void, !dbg [[DBG28]]
-;
 entry:
   %cmp28 = icmp sgt i32 %N, 0, !dbg !9
   br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !10, !prof !58
@@ -145,68 +82,6 @@ for.cond.cleanup:
 
 ; Function Attrs: norecurse nounwind ssp uwtable
 define void @hot(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !26 !prof !57 {
-; HOTNESS-LABEL: @hot(
-; HOTNESS-NEXT:  entry:
-; HOTNESS-NEXT:    [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG31:![0-9]+]]
-; HOTNESS-NEXT:    br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG32:![0-9]+]], !prof [[PROF12]]
-; HOTNESS:       ph:
-; HOTNESS-NEXT:    br label [[FOR_BODY:%.*]]
-; HOTNESS:       for.body:
-; HOTNESS-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; HOTNESS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG33:![0-9]+]]
-; HOTNESS-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG33]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG34:![0-9]+]]
-; HOTNESS-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG34]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG35:![0-9]+]]
-; HOTNESS-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG32]]
-; HOTNESS-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG36:![0-9]+]]
-; HOTNESS-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG38:![0-9]+]]
-; HOTNESS-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG38]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG39:![0-9]+]]
-; HOTNESS-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG39]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG40:![0-9]+]]
-; HOTNESS-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG41:![0-9]+]]
-; HOTNESS-NEXT:    store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG42:![0-9]+]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG32]]
-; HOTNESS-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG32]]
-; HOTNESS-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG32]], !prof [[PROF26]], !llvm.loop [[LOOP43:![0-9]+]]
-; HOTNESS:       for.cond.cleanup.loopexit:
-; HOTNESS-NEXT:    br label [[FOR_COND_CLEANUP]], !dbg [[DBG44:![0-9]+]]
-; HOTNESS:       for.cond.cleanup:
-; HOTNESS-NEXT:    ret void, !dbg [[DBG44]]
-;
-; NO_HOTNESS-LABEL: @hot(
-; NO_HOTNESS-NEXT:  entry:
-; NO_HOTNESS-NEXT:    [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG31:![0-9]+]]
-; NO_HOTNESS-NEXT:    br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG32:![0-9]+]], !prof [[PROF12]]
-; NO_HOTNESS:       ph:
-; NO_HOTNESS-NEXT:    br label [[FOR_BODY:%.*]]
-; NO_HOTNESS:       for.body:
-; NO_HOTNESS-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG33:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG33]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG34:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG34]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG35:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG32]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG36:![0-9]+]]
-; NO_HOTNESS-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG38:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG38]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG39:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG39]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG40:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG41:![0-9]+]]
-; NO_HOTNESS-NEXT:    store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG42:![0-9]+]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG32]]
-; NO_HOTNESS-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG32]]
-; NO_HOTNESS-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG32]], !prof [[PROF26]], !llvm.loop [[LOOP43:![0-9]+]]
-; NO_HOTNESS:       for.cond.cleanup.loopexit:
-; NO_HOTNESS-NEXT:    br label [[FOR_COND_CLEANUP]], !dbg [[DBG44:![0-9]+]]
-; NO_HOTNESS:       for.cond.cleanup:
-; NO_HOTNESS-NEXT:    ret void, !dbg [[DBG44]]
-;
 entry:
   %cmp28 = icmp sgt i32 %N, 0, !dbg !27
   br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !28, !prof !58
@@ -241,68 +116,6 @@ for.cond.cleanup:
 
 ; Function Attrs: norecurse nounwind ssp uwtable
 define void @unknown(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !41 {
-; HOTNESS-LABEL: @unknown(
-; HOTNESS-NEXT:  entry:
-; HOTNESS-NEXT:    [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG46:![0-9]+]]
-; HOTNESS-NEXT:    br i1 [[CMP28]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG47:![0-9]+]]
-; HOTNESS:       for.body.preheader:
-; HOTNESS-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG47]]
-; HOTNESS:       for.cond.cleanup.loopexit:
-; HOTNESS-NEXT:    br label [[FOR_COND_CLEANUP]], !dbg [[DBG48:![0-9]+]]
-; HOTNESS:       for.cond.cleanup:
-; HOTNESS-NEXT:    ret void, !dbg [[DBG48]]
-; HOTNESS:       for.body:
-; HOTNESS-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; HOTNESS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG49:![0-9]+]]
-; HOTNESS-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG49]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG50:![0-9]+]]
-; HOTNESS-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG50]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG51:![0-9]+]]
-; HOTNESS-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG47]]
-; HOTNESS-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG52:![0-9]+]]
-; HOTNESS-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG53:![0-9]+]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG54:![0-9]+]]
-; HOTNESS-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG54]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG55:![0-9]+]]
-; HOTNESS-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG55]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG56:![0-9]+]]
-; HOTNESS-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG57:![0-9]+]]
-; HOTNESS-NEXT:    store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG47]]
-; HOTNESS-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG47]]
-; HOTNESS-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG47]], !llvm.loop [[LOOP59:![0-9]+]]
-;
-; NO_HOTNESS-LABEL: @unknown(
-; NO_HOTNESS-NEXT:  entry:
-; NO_HOTNESS-NEXT:    [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG46:![0-9]+]]
-; NO_HOTNESS-NEXT:    br i1 [[CMP28]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG47:![0-9]+]]
-; NO_HOTNESS:       for.body.preheader:
-; NO_HOTNESS-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG47]]
-; NO_HOTNESS:       for.cond.cleanup.loopexit:
-; NO_HOTNESS-NEXT:    br label [[FOR_COND_CLEANUP]], !dbg [[DBG48:![0-9]+]]
-; NO_HOTNESS:       for.cond.cleanup:
-; NO_HOTNESS-NEXT:    ret void, !dbg [[DBG48]]
-; NO_HOTNESS:       for.body:
-; NO_HOTNESS-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG49:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG49]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG50:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG50]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG51:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG47]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG52:![0-9]+]]
-; NO_HOTNESS-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG53:![0-9]+]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG54:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG54]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG55:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG55]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG56:![0-9]+]]
-; NO_HOTNESS-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG57:![0-9]+]]
-; NO_HOTNESS-NEXT:    store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG47]]
-; NO_HOTNESS-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG47]]
-; NO_HOTNESS-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG47]], !llvm.loop [[LOOP59:![0-9]+]]
-;
 entry:
   %cmp28 = icmp sgt i32 %N, 0, !dbg !42
   br i1 %cmp28, label %for.body, label %for.cond.cleanup, !dbg !43

diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
index 9258b1489727e..28cb7eb057007 100644
--- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
+++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
@@ -31,7 +31,7 @@ define dso_local void @constTC(i32* noalias nocapture %A) optsize {
 ; CHECK-NEXT:    store <2 x i32> <i32 13, i32 13>, <2 x i32>* [[TMP11]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1800
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1800, 1800
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -44,7 +44,7 @@ define dso_local void @constTC(i32* noalias nocapture %A) optsize {
 ; CHECK-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 1800
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;

diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
index 8176940280fd4..daa4548eb427b 100644
--- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
+++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
@@ -25,7 +25,7 @@ define dso_local void @alignTC(i32* noalias nocapture %A, i32 %n) optsize {
 ; CHECK-NEXT:    store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, <4 x i32>* [[TMP3]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[ALIGNEDTC]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -38,7 +38,7 @@ define dso_local void @alignTC(i32* noalias nocapture %A, i32 %n) optsize {
 ; CHECK-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[ALIGNEDTC]]
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -91,7 +91,7 @@ define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q
 ; CHECK-NEXT:    store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, <4 x i32>* [[TMP3]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -104,7 +104,7 @@ define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q
 ; CHECK-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP5:!llvm.loop !.*]]
 ; CHECK:       exit.loopexit:
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
@@ -202,7 +202,7 @@ define dso_local void @cannotProveAlignedTC(i32* noalias nocapture %A, i32 %p, i
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -214,7 +214,7 @@ define dso_local void @cannotProveAlignedTC(i32* noalias nocapture %A, i32 %p, i
 ; CHECK-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP7:!llvm.loop !.*]]
 ; CHECK:       exit.loopexit:
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:

diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
index c385be31eba9e..57ddff65ff90a 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
@@ -44,15 +44,15 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[TMP7]], [[VEC_EPILOG_PH]] ], [ [[TMP12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX6]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP9]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <4 x i64>*
 ; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x i64>, <4 x i64>* [[TMP11]], align 4
 ; CHECK-NEXT:    [[TMP12]] = add <4 x i64> [[WIDE_LOAD8]], [[VEC_PHI7]]
-; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
@@ -141,16 +141,16 @@ define float @fp_reduction_max(float* noalias %a, i64 %N) {
 ; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x float> [[MINMAX_IDENT_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI7:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX6]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP11]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI7]], [[WIDE_LOAD8]]
 ; CHECK-NEXT:    [[TMP13]] = select <4 x i1> [[TMP12]], <4 x float> [[VEC_PHI7]], <4 x float> [[WIDE_LOAD8]]
-; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
@@ -222,7 +222,7 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP9]] = zext <4 x i16> [[TMP8]] to <4 x i32>
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i32> [[TMP9]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP10]])
@@ -237,9 +237,9 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX3:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP13]], [[VEC_EPILOG_PH]] ], [ [[TMP23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[INDEX3]], 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = and <4 x i32> [[VEC_PHI4]], <i32 65535, i32 65535, i32 65535, i32 65535>
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i32 [[TMP14]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP16]], i32 0
@@ -247,11 +247,11 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP18]], align 2
 ; CHECK-NEXT:    [[TMP19:%.*]] = zext <4 x i16> [[WIDE_LOAD5]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i32> [[TMP15]], [[TMP19]]
-; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i32 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i32 [[INDEX3]], 4
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT6]], 256
 ; CHECK-NEXT:    [[TMP22:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP23]] = zext <4 x i16> [[TMP22]] to <4 x i32>
-; CHECK-NEXT:    br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[TMP24:%.*]] = trunc <4 x i32> [[TMP23]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]])
@@ -272,7 +272,7 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT:    [[XOR]] = or i32 [[SUM_02]], [[EXT]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    [[XOR_LCSSA1:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_END]]
@@ -329,7 +329,7 @@ define float @multiple_fp_rdx(float* %A, i64 %N) {
 ; CHECK-NEXT:    [[TMP5]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
@@ -349,19 +349,19 @@ define float @multiple_fp_rdx(float* %A, i64 %N) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> zeroinitializer, float [[BC_MERGE_RDX3]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI10:%.*]] = phi <4 x float> [ [[TMP9]], [[VEC_EPILOG_PH]] ], [ [[TMP16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI11:%.*]] = phi <4 x float> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[TMP15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX9]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP14]], align 4
 ; CHECK-NEXT:    [[TMP15]] = fadd fast <4 x float> [[VEC_PHI11]], [[WIDE_LOAD12]]
 ; CHECK-NEXT:    [[TMP16]] = fmul fast <4 x float> [[VEC_PHI10]], [[WIDE_LOAD12]]
-; CHECK-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX9]], 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC7]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
 ; CHECK-NEXT:    [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP16]])
@@ -382,7 +382,7 @@ define float @multiple_fp_rdx(float* %A, i64 %N) {
 ; CHECK-NEXT:    [[MUL]] = fmul fast float [[PROD]], [[TMP20]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    [[ADD_LCSSA5:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[MUL_LCSSA4:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
@@ -444,7 +444,7 @@ define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
 ; CHECK-NEXT:    [[TMP5]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -461,17 +461,17 @@ define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX6]], 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
 ; CHECK-NEXT:    [[TMP13]] = sub <4 x i32> [[VEC_PHI7]], [[WIDE_LOAD8]]
-; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
 ; CHECK-NEXT:    [[CMP_N5:%.*]] = icmp eq i64 [[N]], [[N_VEC4]]
@@ -488,7 +488,7 @@ define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
 ; CHECK-NEXT:    [[SUB]] = sub nsw i32 [[SUM]], [[LOAD]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       for.cond.loopexit:
 ; CHECK-NEXT:    [[SUB_LCSSA2:%.*]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND]]

diff  --git a/llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll b/llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll
index df3c4f9a8f7aa..2a19c0a95b5d6 100644
--- a/llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll
+++ b/llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll
@@ -1,34 +1,12 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
 
 ; Avoid crashing while trying to vectorize fcmp that can be folded to vector of
 ; i1 true.
 define void @test1() {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 144
-; CHECK-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 144, 144
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 144, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IVNEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[FCMP:%.*]] = fcmp uno float 0.000000e+00, 0.000000e+00
-; CHECK-NEXT:    [[IVNEXT]] = add nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[CND:%.*]] = icmp sgt i32 [[IV]], 142
-; CHECK-NEXT:    br i1 [[CND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: test1(
+; CHECK-LABEL: vector.body:
+; CHECK-NEXT:    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:         %index.next = add nuw i32 %index, 4
 
 entry:
   br label %loop

diff  --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
index cc7f766e648ae..6d2544ae4600e 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
@@ -1,23 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
 
 define void @test_chained_first_order_recurrences_1(i16* %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ 22, [[ENTRY:%.*]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ 33, [[ENTRY]] ], [ [[FOR_1]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[IV]]
-; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, i16* [[GEP_PTR]], align 2
-; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
-; CHECK-NEXT:    store i16 [[ADD]], i16* [[GEP_PTR]], align 2
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: @test_chained_first_order_recurrences_1
+; CHECK-NOT: vector.body:
 ;
 entry:
   br label %loop
@@ -39,22 +24,8 @@ exit:
 }
 
 define void @test_chained_first_order_recurrences_2(i16* %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ 33, [[ENTRY:%.*]] ], [ [[FOR_1:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[FOR_1]] = phi i16 [ 22, [[ENTRY]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[IV]]
-; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, i16* [[GEP_PTR]], align 2
-; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
-; CHECK-NEXT:    store i16 [[ADD]], i16* [[GEP_PTR]], align 2
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: @test_chained_first_order_recurrences_2
+; CHECK-NOT: vector.body:
 ;
 entry:
   br label %loop
@@ -76,24 +47,8 @@ exit:
 }
 
 define void @test_chained_first_order_recurrences_3(i16* %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_3(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ 22, [[ENTRY:%.*]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ 33, [[ENTRY]] ], [ [[FOR_1]], [[LOOP]] ]
-; CHECK-NEXT:    [[FOR_3:%.*]] = phi i16 [ 33, [[ENTRY]] ], [ [[FOR_2]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[IV]]
-; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, i16* [[GEP_PTR]], align 2
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
-; CHECK-NEXT:    [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
-; CHECK-NEXT:    store i16 [[ADD_2]], i16* [[GEP_PTR]], align 2
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: @test_chained_first_order_recurrences_3
+; CHECK-NOT: vector.body:
 ;
 entry:
   br label %loop
@@ -118,22 +73,8 @@ exit:
 
 
 define void @test_cyclic_phis(i16* %ptr) {
-; CHECK-LABEL: @test_cyclic_phis(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ 22, [[ENTRY:%.*]] ], [ [[FOR_2:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[FOR_2]] = phi i16 [ 33, [[ENTRY]] ], [ [[FOR_1]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[IV]]
-; CHECK-NEXT:    [[FOR_1_NEXT:%.*]] = load i16, i16* [[GEP_PTR]], align 2
-; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
-; CHECK-NEXT:    store i16 [[ADD]], i16* [[GEP_PTR]], align 2
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: @test_cyclic_phis
+; CHECK-NOT: vector.body:
 ;
 entry:
   br label %loop
@@ -155,18 +96,10 @@ exit:
 }
 
 define void @test_first_order_recurrences_incoming_cycle_preheader(i16* %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_incoming_cycle_preheader(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP_1:%.*]]
-; CHECK:       loop.1:
-; CHECK-NEXT:    br i1 true, label [[LOOP_PREHEADER:%.*]], label [[LOOP_1]]
-; CHECK:       loop.preheader:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: @test_first_order_recurrences_incoming_cycle_preheader
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 0
@@ -178,28 +111,7 @@ define void @test_first_order_recurrences_incoming_cycle_preheader(i16* %ptr) {
 ; CHECK-NEXT:    store <4 x i16> [[TMP5]], <4 x i16>* [[TMP6]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i16 [ [[FOR_1_NEXT:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 [[IV]]
-; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, i16* [[GEP_PTR]], align 2
-; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[SCALAR_RECUR]], 10
-; CHECK-NEXT:    store i16 [[ADD]], i16* [[GEP_PTR]], align 2
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br i1 [[TMP7]], label %middle.block, label %vector.body
 ;
 entry:
   br label %loop.1
@@ -224,23 +136,8 @@ exit:
 }
 
 define void @test_chained_first_order_recurrence_sink_users_1(double* %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrence_sink_users_1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[FOR_1:%.*]] = phi double [ 1.000000e+01, [[ENTRY:%.*]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[FOR_2:%.*]] = phi double [ 2.000000e+01, [[ENTRY]] ], [ [[FOR_1]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd double 1.000000e+01, [[FOR_2]]
-; CHECK-NEXT:    [[ADD_2:%.*]] = fadd double [[ADD_1]], [[FOR_1]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 [[IV]]
-; CHECK-NEXT:    [[FOR_1_NEXT]] = load double, double* [[GEP_PTR]], align 8
-; CHECK-NEXT:    store double [[ADD_2]], double* [[GEP_PTR]], align 8
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: @test_chained_first_order_recurrence_sink_users_1
+; CHECK-NOT: vector.body:
 ;
 entry:
   br label %loop

diff  --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
index 88d8e94560a5c..6ffccd9cb3604 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
@@ -731,7 +731,7 @@ define i16 @multiple_exit(i16* %p, i32 %n) {
 ; CHECK-NEXT:    store <4 x i16> [[TMP8]], <4 x i16>* [[TMP9]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
@@ -752,7 +752,7 @@ define i16 @multiple_exit(i16* %p, i32 %n) {
 ; CHECK-NEXT:    store i16 [[SCALAR_RECUR]], i16* [[B]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
-; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[SCALAR_RECUR]], [[FOR_COND]] ]
 ; CHECK-NEXT:    ret i16 [[REC_LCSSA]]
@@ -810,7 +810,7 @@ define i16 @multiple_exit2(i16* %p, i32 %n) {
 ; CHECK-NEXT:    store <4 x i16> [[TMP8]], <4 x i16>* [[TMP9]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
@@ -831,7 +831,7 @@ define i16 @multiple_exit2(i16* %p, i32 %n) {
 ; CHECK-NEXT:    store i16 [[SCALAR_RECUR]], i16* [[B]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
-; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_COND]] ], [ 10, [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret i16 [[REC_LCSSA]]
@@ -869,34 +869,34 @@ define void @sink_dominance(i32* %ptr, i32 %N) {
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[UMAX]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP6]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp slt <4 x i32> [[TMP8]], <i32 213, i32 213, i32 213, i32 213>
-; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP8]], <4 x i32> <i32 22, i32 22, i32 22, i32 22>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP11]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP11]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp slt <4 x i32> [[TMP13]], <i32 213, i32 213, i32 213, i32 213>
+; CHECK-NEXT:    [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> [[TMP13]], <4 x i32> <i32 22, i32 22, i32 22, i32 22>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP11]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP11]], i32 2
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -914,7 +914,7 @@ define void @sink_dominance(i32* %ptr, i32 %N) {
 ; CHECK-NEXT:    store i32 [[SELECT]], i32* [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[CMP73:%.*]] = icmp ugt i32 [[N]], [[IV_NEXT]]
-; CHECK-NEXT:    br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -953,36 +953,36 @@ define void @sink_dominance_2(i32* %ptr, i32 %N) {
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[UMAX]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP6]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP10:%.*]] = mul <4 x i32> [[TMP9]], <i32 99, i32 99, i32 99, i32 99>
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp slt <4 x i32> [[TMP8]], <i32 213, i32 213, i32 213, i32 213>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> [[TMP10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP11]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP11]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], <i32 99, i32 99, i32 99, i32 99>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp slt <4 x i32> [[TMP13]], <i32 213, i32 213, i32 213, i32 213>
+; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[TMP16]], <4 x i32> [[TMP13]], <4 x i32> [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP17]], <4 x i32>* [[TMP18]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP11]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP11]], i32 2
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -1002,7 +1002,7 @@ define void @sink_dominance_2(i32* %ptr, i32 %N) {
 ; CHECK-NEXT:    store i32 [[SELECT]], i32* [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[CMP73:%.*]] = icmp ugt i32 [[N]], [[IV_NEXT]]
-; CHECK-NEXT:    br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index ac4df483fdc84..7da1b810e22d3 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -47,12 +47,12 @@ define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n)
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -111,20 +111,20 @@ define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n)
 ; UNROLL-NEXT:    [[TMP7:%.*]] = or i64 [[INDEX]], 1
 ; UNROLL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
 ; UNROLL-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !0
+; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
 ; UNROLL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 4
 ; UNROLL-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; UNROLL-NEXT:    [[WIDE_LOAD7]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !0
+; UNROLL-NEXT:    [[WIDE_LOAD7]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
 ; UNROLL-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
 ; UNROLL-NEXT:    [[TMP15:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP12]]
 ; UNROLL-NEXT:    [[TMP16:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[TMP13]]
 ; UNROLL-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; UNROLL-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4, !alias.scope !3, !noalias !0
+; UNROLL-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
 ; UNROLL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i64 4
 ; UNROLL-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; UNROLL-NEXT:    store <4 x i32> [[TMP16]], <4 x i32>* [[TMP19]], align 4, !alias.scope !3, !noalias !0
+; UNROLL-NEXT:    store <4 x i32> [[TMP16]], <4 x i32>* [[TMP19]], align 4
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -194,10 +194,10 @@ define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n)
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4, !alias.scope !0
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 4
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD7]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !0
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD7]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP7]]
@@ -206,10 +206,10 @@ define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n)
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[TMP18]]
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP21]], <4 x i32>* [[TMP24]], align 4, !alias.scope !3, !noalias !0
+; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP21]], <4 x i32>* [[TMP24]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 4
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP22]], <4 x i32>* [[TMP26]], align 4, !alias.scope !3, !noalias !0
+; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP22]], <4 x i32>* [[TMP26]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -277,14 +277,14 @@ define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n)
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[INDUCTION7]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
 ; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP8]]
-; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP9]], align 4, !alias.scope !0
-; UNROLL-NO-VF-NEXT:    [[TMP12]] = load i32, i32* [[TMP10]], align 4, !alias.scope !0
+; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP9]], align 4
+; UNROLL-NO-VF-NEXT:    [[TMP12]] = load i32, i32* [[TMP10]], align 4
 ; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION]]
 ; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION7]]
 ; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = add i32 [[TMP11]], [[VECTOR_RECUR]]
 ; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = add i32 [[TMP12]], [[TMP11]]
-; UNROLL-NO-VF-NEXT:    store i32 [[TMP15]], i32* [[TMP13]], align 4, !alias.scope !3, !noalias !0
-; UNROLL-NO-VF-NEXT:    store i32 [[TMP16]], i32* [[TMP14]], align 4, !alias.scope !3, !noalias !0
+; UNROLL-NO-VF-NEXT:    store i32 [[TMP15]], i32* [[TMP13]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 [[TMP16]], i32* [[TMP14]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -350,13 +350,13 @@ define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n)
 ; SINK-AFTER-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP8]]
 ; SINK-AFTER-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
 ; SINK-AFTER-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; SINK-AFTER-NEXT:    [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !0
+; SINK-AFTER-NEXT:    [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
 ; SINK-AFTER-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP7]]
 ; SINK-AFTER-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP12]]
 ; SINK-AFTER-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0
 ; SINK-AFTER-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; SINK-AFTER-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !alias.scope !3, !noalias !0
+; SINK-AFTER-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -650,11 +650,11 @@ define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) {
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ [[DOTPRE]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ poison, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ poison, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ poison, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NO-VF-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NO-VF-NEXT:    [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION2]]
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION1]]
 ; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
 ; UNROLL-NO-VF-NEXT:    [[TMP6]] = load i32, i32* [[TMP4]], align 4
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = sub nsw i32 [[TMP5]], [[VECTOR_RECUR]]
@@ -664,9 +664,9 @@ define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) {
 ; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i32 [[TMP7]], i32 0
 ; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 0
 ; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[VEC_PHI]], [[TMP11]]
-; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = icmp slt i32 [[VEC_PHI1]], [[TMP12]]
+; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = icmp slt i32 [[VEC_PHI2]], [[TMP12]]
 ; UNROLL-NO-VF-NEXT:    [[TMP15]] = select i1 [[TMP13]], i32 [[VEC_PHI]], i32 [[TMP11]]
-; UNROLL-NO-VF-NEXT:    [[TMP16]] = select i1 [[TMP14]], i32 [[VEC_PHI1]], i32 [[TMP12]]
+; UNROLL-NO-VF-NEXT:    [[TMP16]] = select i1 [[TMP14]], i32 [[VEC_PHI2]], i32 [[TMP12]]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -858,7 +858,7 @@ define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !11
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP12:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double>
 ; CHECK-NEXT:    [[TMP13:%.*]] = sitofp <4 x i16> [[TMP11]] to <4 x double>
@@ -866,7 +866,7 @@ define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32
 ; CHECK-NEXT:    [[TMP15:%.*]] = fsub fast <4 x double> [[TMP12]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP15]], <4 x double>* [[TMP17]], align 8, !alias.scope !14, !noalias !11
+; CHECK-NEXT:    store <4 x double> [[TMP15]], <4 x double>* [[TMP17]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -944,10 +944,10 @@ define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32
 ; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1
 ; UNROLL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[OFFSET_IDX]]
 ; UNROLL-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>*
-; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !11
+; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2
 ; UNROLL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP9]], i64 4
 ; UNROLL-NEXT:    [[TMP12:%.*]] = bitcast i16* [[TMP11]] to <4 x i16>*
-; UNROLL-NEXT:    [[WIDE_LOAD8]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 2, !alias.scope !11
+; UNROLL-NEXT:    [[WIDE_LOAD8]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 2
 ; UNROLL-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD8]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NEXT:    [[TMP15:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double>
@@ -960,10 +960,10 @@ define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32
 ; UNROLL-NEXT:    [[TMP22:%.*]] = fsub fast <4 x double> [[TMP16]], [[TMP20]]
 ; UNROLL-NEXT:    [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[OFFSET_IDX]]
 ; UNROLL-NEXT:    [[TMP24:%.*]] = bitcast double* [[TMP23]] to <4 x double>*
-; UNROLL-NEXT:    store <4 x double> [[TMP21]], <4 x double>* [[TMP24]], align 8, !alias.scope !14, !noalias !11
+; UNROLL-NEXT:    store <4 x double> [[TMP21]], <4 x double>* [[TMP24]], align 8
 ; UNROLL-NEXT:    [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP23]], i64 4
 ; UNROLL-NEXT:    [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>*
-; UNROLL-NEXT:    store <4 x double> [[TMP22]], <4 x double>* [[TMP26]], align 8, !alias.scope !14, !noalias !11
+; UNROLL-NEXT:    store <4 x double> [[TMP22]], <4 x double>* [[TMP26]], align 8
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -1048,10 +1048,10 @@ define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP8]]
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP9]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = bitcast i16* [[TMP11]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 2, !alias.scope !11
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[TMP9]], i32 4
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = bitcast i16* [[TMP13]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD8]] = load <4 x i16>, <4 x i16>* [[TMP14]], align 2, !alias.scope !11
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD8]] = load <4 x i16>, <4 x i16>* [[TMP14]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD8]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double>
@@ -1066,10 +1066,10 @@ define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP8]]
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP25]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = bitcast double* [[TMP27]] to <4 x double>*
-; UNROLL-NO-IC-NEXT:    store <4 x double> [[TMP23]], <4 x double>* [[TMP28]], align 8, !alias.scope !14, !noalias !11
+; UNROLL-NO-IC-NEXT:    store <4 x double> [[TMP23]], <4 x double>* [[TMP28]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP25]], i32 4
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>*
-; UNROLL-NO-IC-NEXT:    store <4 x double> [[TMP24]], <4 x double>* [[TMP30]], align 8, !alias.scope !14, !noalias !11
+; UNROLL-NO-IC-NEXT:    store <4 x double> [[TMP24]], <4 x double>* [[TMP30]], align 8
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -1148,8 +1148,8 @@ define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32
 ; UNROLL-NO-VF-NEXT:    [[INDUCTION8:%.*]] = add i64 [[OFFSET_IDX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[INDUCTION]]
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[INDUCTION8]]
-; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = load i16, i16* [[TMP7]], align 2, !alias.scope !10
-; UNROLL-NO-VF-NEXT:    [[TMP10]] = load i16, i16* [[TMP8]], align 2, !alias.scope !10
+; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = load i16, i16* [[TMP7]], align 2
+; UNROLL-NO-VF-NEXT:    [[TMP10]] = load i16, i16* [[TMP8]], align 2
 ; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = sitofp i16 [[TMP9]] to double
 ; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = sitofp i16 [[TMP10]] to double
 ; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = sitofp i16 [[VECTOR_RECUR]] to double
@@ -1160,8 +1160,8 @@ define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32
 ; UNROLL-NO-VF-NEXT:    [[TMP18:%.*]] = fsub fast double [[TMP12]], [[TMP16]]
 ; UNROLL-NO-VF-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDUCTION]]
 ; UNROLL-NO-VF-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDUCTION8]]
-; UNROLL-NO-VF-NEXT:    store double [[TMP17]], double* [[TMP19]], align 8, !alias.scope !13, !noalias !10
-; UNROLL-NO-VF-NEXT:    store double [[TMP18]], double* [[TMP20]], align 8, !alias.scope !13, !noalias !10
+; UNROLL-NO-VF-NEXT:    store double [[TMP17]], double* [[TMP19]], align 8
+; UNROLL-NO-VF-NEXT:    store double [[TMP18]], double* [[TMP20]], align 8
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
@@ -1241,7 +1241,7 @@ define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32
 ; SINK-AFTER-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP7]]
 ; SINK-AFTER-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP8]], i32 0
 ; SINK-AFTER-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>*
-; SINK-AFTER-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !11
+; SINK-AFTER-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2
 ; SINK-AFTER-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[TMP12:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double>
 ; SINK-AFTER-NEXT:    [[TMP13:%.*]] = sitofp <4 x i16> [[TMP11]] to <4 x double>
@@ -1250,7 +1250,7 @@ define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32
 ; SINK-AFTER-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP7]]
 ; SINK-AFTER-NEXT:    [[TMP17:%.*]] = getelementptr inbounds double, double* [[TMP16]], i32 0
 ; SINK-AFTER-NEXT:    [[TMP18:%.*]] = bitcast double* [[TMP17]] to <4 x double>*
-; SINK-AFTER-NEXT:    store <4 x double> [[TMP15]], <4 x double>* [[TMP18]], align 8, !alias.scope !14, !noalias !11
+; SINK-AFTER-NEXT:    store <4 x double> [[TMP15]], <4 x double>* [[TMP18]], align 8
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -1969,10 +1969,10 @@ define void @constant_folded_previous_value() {
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ <i64 1, i64 1, i64 1, i64 1>, [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 undef, undef
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -2028,10 +2028,10 @@ define void @constant_folded_previous_value() {
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ <i64 1, i64 1, i64 1, i64 1>, [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SINK-AFTER-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; SINK-AFTER-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; SINK-AFTER-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; SINK-AFTER-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; SINK-AFTER:       middle.block:
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 undef, undef
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -2130,20 +2130,20 @@ define i32 @extract_second_last_iteration(i32* %cval, i32 %x)  {
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; UNROLL-NO-IC-NEXT:    [[TMP1]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT3]]
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; UNROLL-NO-IC-NEXT:    [[TMP11]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT3]]
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP10]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 96, 96
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -2168,19 +2168,19 @@ define i32 @extract_second_last_iteration(i32* %cval, i32 %x)  {
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[INDUCTION:%.*]] = add i32 [[INDEX]], 0
 ; UNROLL-NO-VF-NEXT:    [[INDUCTION1:%.*]] = add i32 [[INDEX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i32 [[INDUCTION]], [[X:%.*]]
-; UNROLL-NO-VF-NEXT:    [[TMP1]] = add i32 [[INDUCTION1]], [[X]]
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[INDUCTION]], [[X:%.*]]
+; UNROLL-NO-VF-NEXT:    [[TMP3]] = add i32 [[INDUCTION1]], [[X]]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 96, 96
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
@@ -2192,7 +2192,7 @@ define i32 @extract_second_last_iteration(i32* %cval, i32 %x)  {
 ; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], 95
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; UNROLL-NO-VF:       for.end:
-; UNROLL-NO-VF-NEXT:    [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    ret i32 [[VAL_PHI_LCSSA]]
 ;
 ; SINK-AFTER-LABEL: @extract_second_last_iteration(
@@ -2205,17 +2205,17 @@ define i32 @extract_second_last_iteration(i32* %cval, i32 %x)  {
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[TMP0]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SINK-AFTER-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[TMP5]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SINK-AFTER-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; SINK-AFTER-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
-; SINK-AFTER-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; SINK-AFTER-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
+; SINK-AFTER-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; SINK-AFTER:       middle.block:
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i32 96, 96
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -2415,9 +2415,9 @@ define i32 @PR33613(double* %b, double %j, i32 %d) {
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI9:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI9:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 25
 ; UNROLL-NO-IC-NEXT:    [[NEXT_GEP:%.*]] = getelementptr double, double* [[B]], i64 [[TMP1]]
@@ -2442,67 +2442,67 @@ define i32 @PR33613(double* %b, double %j, i32 %d) {
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 7
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 25
 ; UNROLL-NO-IC-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr double, double* [[B]], i64 [[TMP15]]
-; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP2]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP3]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP4]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP5]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP6]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP7]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP8]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = load double, double* [[TMP16]], align 8
-; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = load double, double* [[TMP17]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP2]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP3]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP4]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP5]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP6]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP7]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP8]], i64 [[IDXPROM]]
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = load double, double* [[TMP18]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = load double, double* [[TMP19]], align 8
-; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = insertelement <4 x double> poison, double [[TMP24]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = insertelement <4 x double> [[TMP28]], double [[TMP25]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = insertelement <4 x double> [[TMP29]], double [[TMP26]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = insertelement <4 x double> [[TMP30]], double [[TMP27]], i32 3
-; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = load double, double* [[TMP20]], align 8
-; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = load double, double* [[TMP21]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = load double, double* [[TMP20]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = load double, double* [[TMP21]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = insertelement <4 x double> poison, double [[TMP26]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = insertelement <4 x double> [[TMP30]], double [[TMP27]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = insertelement <4 x double> [[TMP31]], double [[TMP28]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = insertelement <4 x double> [[TMP32]], double [[TMP29]], i32 3
 ; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = load double, double* [[TMP22]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = load double, double* [[TMP23]], align 8
-; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = insertelement <4 x double> poison, double [[TMP32]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = insertelement <4 x double> [[TMP36]], double [[TMP33]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = insertelement <4 x double> [[TMP37]], double [[TMP34]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP39]] = insertelement <4 x double> [[TMP38]], double [[TMP35]], i32 3
-; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP31]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = shufflevector <4 x double> [[TMP31]], <4 x double> [[TMP39]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = fmul <4 x double> [[TMP40]], [[TMP31]]
-; UNROLL-NO-IC-NEXT:    [[TMP43:%.*]] = fmul <4 x double> [[TMP41]], [[TMP39]]
-; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = fcmp une <4 x double> [[TMP42]], zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = fcmp une <4 x double> [[TMP43]], zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[TMP46:%.*]] = zext <4 x i1> [[TMP44]] to <4 x i32>
-; UNROLL-NO-IC-NEXT:    [[TMP47:%.*]] = zext <4 x i1> [[TMP45]] to <4 x i32>
-; UNROLL-NO-IC-NEXT:    [[TMP48]] = add <4 x i32> [[VEC_PHI]], [[TMP46]]
-; UNROLL-NO-IC-NEXT:    [[TMP49]] = add <4 x i32> [[VEC_PHI9]], [[TMP47]]
+; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = load double, double* [[TMP24]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = load double, double* [[TMP25]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = insertelement <4 x double> poison, double [[TMP34]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = insertelement <4 x double> [[TMP38]], double [[TMP35]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = insertelement <4 x double> [[TMP39]], double [[TMP36]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP41]] = insertelement <4 x double> [[TMP40]], double [[TMP37]], i32 3
+; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP33]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP43:%.*]] = shufflevector <4 x double> [[TMP33]], <4 x double> [[TMP41]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = fmul <4 x double> [[TMP42]], [[TMP33]]
+; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = fmul <4 x double> [[TMP43]], [[TMP41]]
+; UNROLL-NO-IC-NEXT:    [[TMP46:%.*]] = fcmp une <4 x double> [[TMP44]], zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[TMP47:%.*]] = fcmp une <4 x double> [[TMP45]], zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[TMP48:%.*]] = zext <4 x i1> [[TMP46]] to <4 x i32>
+; UNROLL-NO-IC-NEXT:    [[TMP49:%.*]] = zext <4 x i1> [[TMP47]] to <4 x i32>
+; UNROLL-NO-IC-NEXT:    [[TMP50]] = add <4 x i32> [[VEC_PHI]], [[TMP48]]
+; UNROLL-NO-IC-NEXT:    [[TMP51]] = add <4 x i32> [[VEC_PHI9]], [[TMP49]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
-; UNROLL-NO-IC-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]]
-; UNROLL-NO-IC-NEXT:    [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; UNROLL-NO-IC-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP51]], [[TMP50]]
+; UNROLL-NO-IC-NEXT:    [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10240, 10240
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP39]], i32 3
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP39]], i32 2
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP41]], i32 3
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP41]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[J]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi double* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 10240, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.cond.cleanup:
-; UNROLL-NO-IC-NEXT:    [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT:    [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-IC-NEXT:    ret i32 [[A_1_LCSSA]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[B_ADDR_012:%.*]] = phi double* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[I_011:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC1:%.*]], [[FOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[A_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_1]], [[FOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP52:%.*]], [[FOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP54:%.*]], [[FOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B_ADDR_012]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT:    [[TMP52]] = load double, double* [[ARRAYIDX]], align 8
-; UNROLL-NO-IC-NEXT:    [[MUL:%.*]] = fmul double [[SCALAR_RECUR]], [[TMP52]]
+; UNROLL-NO-IC-NEXT:    [[TMP54]] = load double, double* [[ARRAYIDX]], align 8
+; UNROLL-NO-IC-NEXT:    [[MUL:%.*]] = fmul double [[SCALAR_RECUR]], [[TMP54]]
 ; UNROLL-NO-IC-NEXT:    [[TOBOOL:%.*]] = fcmp une double [[MUL]], 0.000000e+00
 ; UNROLL-NO-IC-NEXT:    [[INC:%.*]] = zext i1 [[TOBOOL]] to i32
 ; UNROLL-NO-IC-NEXT:    [[A_1]] = add nsw i32 [[A_010]], [[INC]]
@@ -2521,7 +2521,7 @@ define i32 @PR33613(double* %b, double %j, i32 %d) {
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[VEC_PHI4:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi double [ [[J:%.*]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 25
@@ -2540,7 +2540,7 @@ define i32 @PR33613(double* %b, double %j, i32 %d) {
 ; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP10]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP11]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP14]] = add i32 [[VEC_PHI]], [[TMP12]]
-; UNROLL-NO-VF-NEXT:    [[TMP15]] = add i32 [[VEC_PHI3]], [[TMP13]]
+; UNROLL-NO-VF-NEXT:    [[TMP15]] = add i32 [[VEC_PHI4]], [[TMP13]]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
@@ -2583,8 +2583,8 @@ define i32 @PR33613(double* %b, double %j, i32 %d) {
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; SINK-AFTER-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 25
 ; SINK-AFTER-NEXT:    [[NEXT_GEP:%.*]] = getelementptr double, double* [[B]], i64 [[TMP1]]
@@ -2597,49 +2597,49 @@ define i32 @PR33613(double* %b, double %j, i32 %d) {
 ; SINK-AFTER-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 3
 ; SINK-AFTER-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 25
 ; SINK-AFTER-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr double, double* [[B]], i64 [[TMP7]]
-; SINK-AFTER-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP]], i64 [[IDXPROM]]
-; SINK-AFTER-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP2]], i64 [[IDXPROM]]
-; SINK-AFTER-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP3]], i64 [[IDXPROM]]
-; SINK-AFTER-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP4]], i64 [[IDXPROM]]
-; SINK-AFTER-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP8]], align 8
+; SINK-AFTER-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP]], i64 [[IDXPROM]]
+; SINK-AFTER-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP2]], i64 [[IDXPROM]]
+; SINK-AFTER-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP3]], i64 [[IDXPROM]]
+; SINK-AFTER-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP4]], i64 [[IDXPROM]]
 ; SINK-AFTER-NEXT:    [[TMP13:%.*]] = load double, double* [[TMP9]], align 8
 ; SINK-AFTER-NEXT:    [[TMP14:%.*]] = load double, double* [[TMP10]], align 8
 ; SINK-AFTER-NEXT:    [[TMP15:%.*]] = load double, double* [[TMP11]], align 8
-; SINK-AFTER-NEXT:    [[TMP16:%.*]] = insertelement <4 x double> poison, double [[TMP12]], i32 0
-; SINK-AFTER-NEXT:    [[TMP17:%.*]] = insertelement <4 x double> [[TMP16]], double [[TMP13]], i32 1
-; SINK-AFTER-NEXT:    [[TMP18:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP14]], i32 2
-; SINK-AFTER-NEXT:    [[TMP19]] = insertelement <4 x double> [[TMP18]], double [[TMP15]], i32 3
-; SINK-AFTER-NEXT:    [[TMP20:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP19]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; SINK-AFTER-NEXT:    [[TMP21:%.*]] = fmul <4 x double> [[TMP20]], [[TMP19]]
-; SINK-AFTER-NEXT:    [[TMP22:%.*]] = fcmp une <4 x double> [[TMP21]], zeroinitializer
-; SINK-AFTER-NEXT:    [[TMP23:%.*]] = zext <4 x i1> [[TMP22]] to <4 x i32>
-; SINK-AFTER-NEXT:    [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23]]
+; SINK-AFTER-NEXT:    [[TMP16:%.*]] = load double, double* [[TMP12]], align 8
+; SINK-AFTER-NEXT:    [[TMP17:%.*]] = insertelement <4 x double> poison, double [[TMP13]], i32 0
+; SINK-AFTER-NEXT:    [[TMP18:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP14]], i32 1
+; SINK-AFTER-NEXT:    [[TMP19:%.*]] = insertelement <4 x double> [[TMP18]], double [[TMP15]], i32 2
+; SINK-AFTER-NEXT:    [[TMP20]] = insertelement <4 x double> [[TMP19]], double [[TMP16]], i32 3
+; SINK-AFTER-NEXT:    [[TMP21:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP20]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT:    [[TMP22:%.*]] = fmul <4 x double> [[TMP21]], [[TMP20]]
+; SINK-AFTER-NEXT:    [[TMP23:%.*]] = fcmp une <4 x double> [[TMP22]], zeroinitializer
+; SINK-AFTER-NEXT:    [[TMP24:%.*]] = zext <4 x i1> [[TMP23]] to <4 x i32>
+; SINK-AFTER-NEXT:    [[TMP25]] = add <4 x i32> [[VEC_PHI]], [[TMP24]]
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SINK-AFTER-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
-; SINK-AFTER-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; SINK-AFTER-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
+; SINK-AFTER-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; SINK-AFTER:       middle.block:
-; SINK-AFTER-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]])
+; SINK-AFTER-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]])
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10240, 10240
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP19]], i32 3
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP19]], i32 2
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP20]], i32 3
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP20]], i32 2
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[J]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
 ; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi double* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 10240, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; SINK-AFTER-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; SINK-AFTER-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-AFTER:       for.cond.cleanup:
-; SINK-AFTER-NEXT:    [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; SINK-AFTER-NEXT:    [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
 ; SINK-AFTER-NEXT:    ret i32 [[A_1_LCSSA]]
 ; SINK-AFTER:       for.body:
 ; SINK-AFTER-NEXT:    [[B_ADDR_012:%.*]] = phi double* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[I_011:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC1:%.*]], [[FOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[A_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_1]], [[FOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP27:%.*]], [[FOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B_ADDR_012]], i64 [[IDXPROM]]
-; SINK-AFTER-NEXT:    [[TMP27]] = load double, double* [[ARRAYIDX]], align 8
-; SINK-AFTER-NEXT:    [[MUL:%.*]] = fmul double [[SCALAR_RECUR]], [[TMP27]]
+; SINK-AFTER-NEXT:    [[TMP28]] = load double, double* [[ARRAYIDX]], align 8
+; SINK-AFTER-NEXT:    [[MUL:%.*]] = fmul double [[SCALAR_RECUR]], [[TMP28]]
 ; SINK-AFTER-NEXT:    [[TOBOOL:%.*]] = fcmp une double [[MUL]], 0.000000e+00
 ; SINK-AFTER-NEXT:    [[INC:%.*]] = zext i1 [[TOBOOL]] to i32
 ; SINK-AFTER-NEXT:    [[A_1]] = add nsw i32 [[A_010]], [[INC]]
@@ -2707,14 +2707,14 @@ define void @sink_after(i16* %a, i32* %b, i64 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !26
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !alias.scope !29, !noalias !26
+; CHECK-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
@@ -2768,10 +2768,10 @@ define void @sink_after(i16* %a, i32* %b, i64 %n) {
 ; UNROLL-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
 ; UNROLL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]]
 ; UNROLL-NEXT:    [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !26
+; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
 ; UNROLL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i64 4
 ; UNROLL-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
-; UNROLL-NEXT:    [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 2, !alias.scope !26
+; UNROLL-NEXT:    [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 2
 ; UNROLL-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NEXT:    [[TMP10:%.*]] = sext <4 x i16> [[TMP8]] to <4 x i32>
@@ -2782,10 +2782,10 @@ define void @sink_after(i16* %a, i32* %b, i64 %n) {
 ; UNROLL-NEXT:    [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP11]]
 ; UNROLL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
 ; UNROLL-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; UNROLL-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP17]], align 4, !alias.scope !29, !noalias !26
+; UNROLL-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP17]], align 4
 ; UNROLL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i64 4
 ; UNROLL-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; UNROLL-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP19]], align 4, !alias.scope !29, !noalias !26
+; UNROLL-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP19]], align 4
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
@@ -2847,10 +2847,10 @@ define void @sink_after(i16* %a, i32* %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP4]]
 ; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 2, !alias.scope !26
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 4
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !26
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = sext <4 x i16> [[TMP11]] to <4 x i32>
@@ -2863,10 +2863,10 @@ define void @sink_after(i16* %a, i32* %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP17]], <4 x i32>* [[TMP22]], align 4, !alias.scope !29, !noalias !26
+; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP17]], <4 x i32>* [[TMP22]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 4
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP18]], <4 x i32>* [[TMP24]], align 4, !alias.scope !29, !noalias !26
+; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP18]], <4 x i32>* [[TMP24]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
@@ -2926,8 +2926,8 @@ define void @sink_after(i16* %a, i32* %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[INDUCTION7]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP1]]
 ; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]]
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 2, !alias.scope !25
-; UNROLL-NO-VF-NEXT:    [[TMP6]] = load i16, i16* [[TMP4]], align 2, !alias.scope !25
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 2
+; UNROLL-NO-VF-NEXT:    [[TMP6]] = load i16, i16* [[TMP4]], align 2
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = sext i16 [[VECTOR_RECUR]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = sext i16 [[TMP5]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = sext i16 [[TMP5]] to i32
@@ -2936,8 +2936,8 @@ define void @sink_after(i16* %a, i32* %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = mul nsw i32 [[TMP10]], [[TMP8]]
 ; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION]]
 ; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION7]]
-; UNROLL-NO-VF-NEXT:    store i32 [[TMP11]], i32* [[TMP13]], align 4, !alias.scope !28, !noalias !25
-; UNROLL-NO-VF-NEXT:    store i32 [[TMP12]], i32* [[TMP14]], align 4, !alias.scope !28, !noalias !25
+; UNROLL-NO-VF-NEXT:    store i32 [[TMP11]], i32* [[TMP13]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 [[TMP12]], i32* [[TMP14]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
@@ -2995,7 +2995,7 @@ define void @sink_after(i16* %a, i32* %b, i64 %n) {
 ; SINK-AFTER-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]]
 ; SINK-AFTER-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0
 ; SINK-AFTER-NEXT:    [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; SINK-AFTER-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !26
+; SINK-AFTER-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
 ; SINK-AFTER-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32>
 ; SINK-AFTER-NEXT:    [[TMP8:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
@@ -3003,7 +3003,7 @@ define void @sink_after(i16* %a, i32* %b, i64 %n) {
 ; SINK-AFTER-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
 ; SINK-AFTER-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
 ; SINK-AFTER-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; SINK-AFTER-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP12]], align 4, !alias.scope !29, !noalias !26
+; SINK-AFTER-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP12]], align 4
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
@@ -3114,11 +3114,11 @@ define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP5]], i64 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP6]], i64 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP12]], align 4, !alias.scope !33, !noalias !36
-; CHECK-NEXT:    [[TMP13:%.*]] = load i16, i16* [[TMP8]], align 2, !alias.scope !39
-; CHECK-NEXT:    [[TMP14:%.*]] = load i16, i16* [[TMP9]], align 2, !alias.scope !39
-; CHECK-NEXT:    [[TMP15:%.*]] = load i16, i16* [[TMP10]], align 2, !alias.scope !39
-; CHECK-NEXT:    [[TMP16:%.*]] = load i16, i16* [[TMP11]], align 2, !alias.scope !39
+; CHECK-NEXT:    store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = load i16, i16* [[TMP8]], align 2
+; CHECK-NEXT:    [[TMP14:%.*]] = load i16, i16* [[TMP9]], align 2
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, i16* [[TMP10]], align 2
+; CHECK-NEXT:    [[TMP16:%.*]] = load i16, i16* [[TMP11]], align 2
 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i16> poison, i16 [[TMP13]], i64 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP14]], i64 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[TMP15]], i64 2
@@ -3129,7 +3129,7 @@ define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) {
 ; CHECK-NEXT:    [[TMP24:%.*]] = mul nsw <4 x i32> [[TMP23]], [[TMP22]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP24]], <4 x i32>* [[TMP26]], align 4, !alias.scope !40, !noalias !39
+; CHECK-NEXT:    store <4 x i32> [[TMP24]], <4 x i32>* [[TMP26]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -3209,22 +3209,22 @@ define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) {
 ; UNROLL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP9]], i64 1
 ; UNROLL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP10]], i64 1
 ; UNROLL-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; UNROLL-NEXT:    store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP20]], align 4, !alias.scope !33, !noalias !36
+; UNROLL-NEXT:    store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP20]], align 4
 ; UNROLL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 4
 ; UNROLL-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; UNROLL-NEXT:    store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP22]], align 4, !alias.scope !33, !noalias !36
-; UNROLL-NEXT:    [[TMP23:%.*]] = load i16, i16* [[TMP12]], align 2, !alias.scope !39
-; UNROLL-NEXT:    [[TMP24:%.*]] = load i16, i16* [[TMP13]], align 2, !alias.scope !39
-; UNROLL-NEXT:    [[TMP25:%.*]] = load i16, i16* [[TMP14]], align 2, !alias.scope !39
-; UNROLL-NEXT:    [[TMP26:%.*]] = load i16, i16* [[TMP15]], align 2, !alias.scope !39
+; UNROLL-NEXT:    store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP22]], align 4
+; UNROLL-NEXT:    [[TMP23:%.*]] = load i16, i16* [[TMP12]], align 2
+; UNROLL-NEXT:    [[TMP24:%.*]] = load i16, i16* [[TMP13]], align 2
+; UNROLL-NEXT:    [[TMP25:%.*]] = load i16, i16* [[TMP14]], align 2
+; UNROLL-NEXT:    [[TMP26:%.*]] = load i16, i16* [[TMP15]], align 2
 ; UNROLL-NEXT:    [[TMP27:%.*]] = insertelement <4 x i16> poison, i16 [[TMP23]], i64 0
 ; UNROLL-NEXT:    [[TMP28:%.*]] = insertelement <4 x i16> [[TMP27]], i16 [[TMP24]], i64 1
 ; UNROLL-NEXT:    [[TMP29:%.*]] = insertelement <4 x i16> [[TMP28]], i16 [[TMP25]], i64 2
 ; UNROLL-NEXT:    [[TMP30:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP26]], i64 3
-; UNROLL-NEXT:    [[TMP31:%.*]] = load i16, i16* [[TMP16]], align 2, !alias.scope !39
-; UNROLL-NEXT:    [[TMP32:%.*]] = load i16, i16* [[TMP17]], align 2, !alias.scope !39
-; UNROLL-NEXT:    [[TMP33:%.*]] = load i16, i16* [[TMP18]], align 2, !alias.scope !39
-; UNROLL-NEXT:    [[TMP34:%.*]] = load i16, i16* [[TMP19]], align 2, !alias.scope !39
+; UNROLL-NEXT:    [[TMP31:%.*]] = load i16, i16* [[TMP16]], align 2
+; UNROLL-NEXT:    [[TMP32:%.*]] = load i16, i16* [[TMP17]], align 2
+; UNROLL-NEXT:    [[TMP33:%.*]] = load i16, i16* [[TMP18]], align 2
+; UNROLL-NEXT:    [[TMP34:%.*]] = load i16, i16* [[TMP19]], align 2
 ; UNROLL-NEXT:    [[TMP35:%.*]] = insertelement <4 x i16> poison, i16 [[TMP31]], i64 0
 ; UNROLL-NEXT:    [[TMP36:%.*]] = insertelement <4 x i16> [[TMP35]], i16 [[TMP32]], i64 1
 ; UNROLL-NEXT:    [[TMP37:%.*]] = insertelement <4 x i16> [[TMP36]], i16 [[TMP33]], i64 2
@@ -3239,10 +3239,10 @@ define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) {
 ; UNROLL-NEXT:    [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP44]], [[TMP42]]
 ; UNROLL-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
 ; UNROLL-NEXT:    [[TMP48:%.*]] = bitcast i32* [[TMP47]] to <4 x i32>*
-; UNROLL-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP48]], align 4, !alias.scope !40, !noalias !39
+; UNROLL-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP48]], align 4
 ; UNROLL-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i32, i32* [[TMP47]], i64 4
 ; UNROLL-NEXT:    [[TMP50:%.*]] = bitcast i32* [[TMP49]] to <4 x i32>*
-; UNROLL-NEXT:    store <4 x i32> [[TMP46]], <4 x i32>* [[TMP50]], align 4, !alias.scope !40, !noalias !39
+; UNROLL-NEXT:    store <4 x i32> [[TMP46]], <4 x i32>* [[TMP50]], align 4
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NEXT:    [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -3328,22 +3328,22 @@ define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP7]], i64 1
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP19]], align 4, !alias.scope !33, !noalias !36
+; UNROLL-NO-IC-NEXT:    store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP19]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 4
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP21]], align 4, !alias.scope !33, !noalias !36
-; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = load i16, i16* [[TMP10]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = load i16, i16* [[TMP11]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = load i16, i16* [[TMP12]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = load i16, i16* [[TMP13]], align 2, !alias.scope !39
+; UNROLL-NO-IC-NEXT:    store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP21]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = load i16, i16* [[TMP10]], align 2
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = load i16, i16* [[TMP11]], align 2
+; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = load i16, i16* [[TMP12]], align 2
+; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = load i16, i16* [[TMP13]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP22]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP23]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = insertelement <4 x i16> [[TMP27]], i16 [[TMP24]], i32 2
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = insertelement <4 x i16> [[TMP28]], i16 [[TMP25]], i32 3
-; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = load i16, i16* [[TMP14]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = load i16, i16* [[TMP15]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = load i16, i16* [[TMP16]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = load i16, i16* [[TMP17]], align 2, !alias.scope !39
+; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = load i16, i16* [[TMP14]], align 2
+; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = load i16, i16* [[TMP15]], align 2
+; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = load i16, i16* [[TMP16]], align 2
+; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = load i16, i16* [[TMP17]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = insertelement <4 x i16> poison, i16 [[TMP30]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = insertelement <4 x i16> [[TMP34]], i16 [[TMP31]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = insertelement <4 x i16> [[TMP35]], i16 [[TMP32]], i32 2
@@ -3360,10 +3360,10 @@ define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]]
 ; UNROLL-NO-IC-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP46]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP44]], <4 x i32>* [[TMP49]], align 4, !alias.scope !40, !noalias !39
+; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP44]], <4 x i32>* [[TMP49]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP46]], i32 4
 ; UNROLL-NO-IC-NEXT:    [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP51]], align 4, !alias.scope !40, !noalias !39
+; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP51]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -3436,10 +3436,10 @@ define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDUCTION17]]
 ; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[INDUCTION]], i64 1
 ; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[INDUCTION17]], i64 1
-; UNROLL-NO-VF-NEXT:    store i32 7, i32* [[TMP0]], align 4, !alias.scope !32, !noalias !35
-; UNROLL-NO-VF-NEXT:    store i32 7, i32* [[TMP1]], align 4, !alias.scope !32, !noalias !35
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 2, !alias.scope !38
-; UNROLL-NO-VF-NEXT:    [[TMP5]] = load i16, i16* [[TMP3]], align 2, !alias.scope !38
+; UNROLL-NO-VF-NEXT:    store i32 7, i32* [[TMP0]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 7, i32* [[TMP1]], align 4
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 2
+; UNROLL-NO-VF-NEXT:    [[TMP5]] = load i16, i16* [[TMP3]], align 2
 ; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = sext i16 [[VECTOR_RECUR]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP4]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = sext i16 [[TMP4]] to i32
@@ -3448,8 +3448,8 @@ define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = mul nsw i32 [[TMP9]], [[TMP7]]
 ; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION]]
 ; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION17]]
-; UNROLL-NO-VF-NEXT:    store i32 [[TMP10]], i32* [[TMP12]], align 4, !alias.scope !39, !noalias !38
-; UNROLL-NO-VF-NEXT:    store i32 [[TMP11]], i32* [[TMP13]], align 4, !alias.scope !39, !noalias !38
+; UNROLL-NO-VF-NEXT:    store i32 [[TMP10]], i32* [[TMP12]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 [[TMP11]], i32* [[TMP13]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
@@ -3526,11 +3526,11 @@ define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) {
 ; SINK-AFTER-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP3]], i64 1
 ; SINK-AFTER-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
 ; SINK-AFTER-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; SINK-AFTER-NEXT:    store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP10]], align 4, !alias.scope !33, !noalias !36
-; SINK-AFTER-NEXT:    [[TMP11:%.*]] = load i16, i16* [[TMP5]], align 2, !alias.scope !39
-; SINK-AFTER-NEXT:    [[TMP12:%.*]] = load i16, i16* [[TMP6]], align 2, !alias.scope !39
-; SINK-AFTER-NEXT:    [[TMP13:%.*]] = load i16, i16* [[TMP7]], align 2, !alias.scope !39
-; SINK-AFTER-NEXT:    [[TMP14:%.*]] = load i16, i16* [[TMP8]], align 2, !alias.scope !39
+; SINK-AFTER-NEXT:    store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP10]], align 4
+; SINK-AFTER-NEXT:    [[TMP11:%.*]] = load i16, i16* [[TMP5]], align 2
+; SINK-AFTER-NEXT:    [[TMP12:%.*]] = load i16, i16* [[TMP6]], align 2
+; SINK-AFTER-NEXT:    [[TMP13:%.*]] = load i16, i16* [[TMP7]], align 2
+; SINK-AFTER-NEXT:    [[TMP14:%.*]] = load i16, i16* [[TMP8]], align 2
 ; SINK-AFTER-NEXT:    [[TMP15:%.*]] = insertelement <4 x i16> poison, i16 [[TMP11]], i32 0
 ; SINK-AFTER-NEXT:    [[TMP16:%.*]] = insertelement <4 x i16> [[TMP15]], i16 [[TMP12]], i32 1
 ; SINK-AFTER-NEXT:    [[TMP17:%.*]] = insertelement <4 x i16> [[TMP16]], i16 [[TMP13]], i32 2
@@ -3542,7 +3542,7 @@ define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) {
 ; SINK-AFTER-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
 ; SINK-AFTER-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP23]], i32 0
 ; SINK-AFTER-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; SINK-AFTER-NEXT:    store <4 x i32> [[TMP22]], <4 x i32>* [[TMP25]], align 4, !alias.scope !40, !noalias !39
+; SINK-AFTER-NEXT:    store <4 x i32> [[TMP22]], <4 x i32>* [[TMP25]], align 4
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -3632,7 +3632,7 @@ define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !43
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2>
@@ -3640,7 +3640,7 @@ define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP8]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !alias.scope !46, !noalias !43
+; CHECK-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
@@ -3695,10 +3695,10 @@ define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 ; UNROLL-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
 ; UNROLL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]]
 ; UNROLL-NEXT:    [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !43
+; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
 ; UNROLL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i64 4
 ; UNROLL-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
-; UNROLL-NEXT:    [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 2, !alias.scope !43
+; UNROLL-NEXT:    [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 2
 ; UNROLL-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NEXT:    [[TMP10:%.*]] = sext <4 x i16> [[TMP8]] to <4 x i32>
@@ -3711,10 +3711,10 @@ define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 ; UNROLL-NEXT:    [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP15]]
 ; UNROLL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
 ; UNROLL-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; UNROLL-NEXT:    store <4 x i32> [[TMP16]], <4 x i32>* [[TMP19]], align 4, !alias.scope !46, !noalias !43
+; UNROLL-NEXT:    store <4 x i32> [[TMP16]], <4 x i32>* [[TMP19]], align 4
 ; UNROLL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i64 4
 ; UNROLL-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; UNROLL-NEXT:    store <4 x i32> [[TMP17]], <4 x i32>* [[TMP21]], align 4, !alias.scope !46, !noalias !43
+; UNROLL-NEXT:    store <4 x i32> [[TMP17]], <4 x i32>* [[TMP21]], align 4
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
@@ -3777,10 +3777,10 @@ define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP4]]
 ; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 2, !alias.scope !43
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 4
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !43
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = sext <4 x i16> [[TMP11]] to <4 x i32>
@@ -3795,10 +3795,10 @@ define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP19]], <4 x i32>* [[TMP24]], align 4, !alias.scope !46, !noalias !43
+; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP19]], <4 x i32>* [[TMP24]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 4
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP26]], align 4, !alias.scope !46, !noalias !43
+; UNROLL-NO-IC-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP26]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
@@ -3859,8 +3859,8 @@ define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[INDUCTION7]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP1]]
 ; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]]
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 2, !alias.scope !42
-; UNROLL-NO-VF-NEXT:    [[TMP6]] = load i16, i16* [[TMP4]], align 2, !alias.scope !42
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 2
+; UNROLL-NO-VF-NEXT:    [[TMP6]] = load i16, i16* [[TMP4]], align 2
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = sext i16 [[VECTOR_RECUR]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = sext i16 [[TMP5]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP7]], 2
@@ -3871,8 +3871,8 @@ define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = mul nsw i32 [[TMP10]], [[TMP12]]
 ; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION]]
 ; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION7]]
-; UNROLL-NO-VF-NEXT:    store i32 [[TMP13]], i32* [[TMP15]], align 4, !alias.scope !45, !noalias !42
-; UNROLL-NO-VF-NEXT:    store i32 [[TMP14]], i32* [[TMP16]], align 4, !alias.scope !45, !noalias !42
+; UNROLL-NO-VF-NEXT:    store i32 [[TMP13]], i32* [[TMP15]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 [[TMP14]], i32* [[TMP16]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
@@ -3931,7 +3931,7 @@ define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 ; SINK-AFTER-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]]
 ; SINK-AFTER-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0
 ; SINK-AFTER-NEXT:    [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; SINK-AFTER-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !43
+; SINK-AFTER-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
 ; SINK-AFTER-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32>
 ; SINK-AFTER-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2>
@@ -3940,7 +3940,7 @@ define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 ; SINK-AFTER-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
 ; SINK-AFTER-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
 ; SINK-AFTER-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; SINK-AFTER-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4, !alias.scope !46, !noalias !43
+; SINK-AFTER-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
@@ -4231,8 +4231,8 @@ define void @sink_dead_inst() {
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], <i16 4, i16 4, i16 4, i16 4>
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 43, 40
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP7]], i32 3
@@ -4266,20 +4266,20 @@ define void @sink_dead_inst() {
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR1:%.*]] = phi i32 [ -27, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR2:%.*]] = phi i32 [ -27, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[TMP0]]
 ; UNROLL-NO-VF-NEXT:    [[INDUCTION:%.*]] = add i16 [[OFFSET_IDX]], 0
-; UNROLL-NO-VF-NEXT:    [[INDUCTION2:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT:    [[INDUCTION1:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[INDUCTION]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[INDUCTION2]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[INDUCTION1]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP4]] = zext i16 [[TMP2]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add i16 [[TMP1]], 5
 ; UNROLL-NO-VF-NEXT:    [[TMP6]] = add i16 [[TMP2]], 5
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 43, 42
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -4318,8 +4318,8 @@ define void @sink_dead_inst() {
 ; SINK-AFTER-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], <i16 4, i16 4, i16 4, i16 4>
-; SINK-AFTER-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
-; SINK-AFTER-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
+; SINK-AFTER-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
+; SINK-AFTER-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
 ; SINK-AFTER:       middle.block:
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i32 43, 40
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
@@ -4453,17 +4453,17 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
-; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE18:%.*]] ]
-; UNROLL-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_UDIV_CONTINUE18]] ]
-; UNROLL-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_UDIV_CONTINUE18]] ]
-; UNROLL-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE18]] ]
+; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE19:%.*]] ]
+; UNROLL-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_UDIV_CONTINUE19]] ]
+; UNROLL-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_UDIV_CONTINUE19]] ]
+; UNROLL-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE19]] ]
 ; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
-; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0
-; UNROLL-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NEXT:    [[VEC_IV:%.*]] = or <4 x i32> [[BROADCAST_SPLAT3]], <i32 0, i32 1, i32 2, i32 3>
-; UNROLL-NEXT:    [[VEC_IV4:%.*]] = or <4 x i32> [[BROADCAST_SPLAT3]], <i32 4, i32 5, i32 6, i32 7>
+; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0
+; UNROLL-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
+; UNROLL-NEXT:    [[VEC_IV:%.*]] = or <4 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1, i32 2, i32 3>
+; UNROLL-NEXT:    [[VEC_IV5:%.*]] = or <4 x i32> [[BROADCAST_SPLAT4]], <i32 4, i32 5, i32 6, i32 7>
 ; UNROLL-NEXT:    [[TMP2:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; UNROLL-NEXT:    [[TMP3:%.*]] = icmp ule <4 x i32> [[VEC_IV4]], [[BROADCAST_SPLAT]]
+; UNROLL-NEXT:    [[TMP3:%.*]] = icmp ule <4 x i32> [[VEC_IV5]], [[BROADCAST_SPLAT]]
 ; UNROLL-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0
 ; UNROLL-NEXT:    br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; UNROLL:       pred.udiv.if:
@@ -4473,77 +4473,77 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL:       pred.udiv.continue:
 ; UNROLL-NEXT:    [[TMP7:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
-; UNROLL-NEXT:    br i1 [[TMP8]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP8]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
 ; UNROLL:       pred.udiv.if5:
 ; UNROLL-NEXT:    [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NEXT:    [[TMP10:%.*]] = udiv i32 219220132, [[TMP9]]
 ; UNROLL-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 1
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE6]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE7]]
 ; UNROLL:       pred.udiv.continue6:
-; UNROLL-NEXT:    [[TMP12:%.*]] = phi <4 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP11]], [[PRED_UDIV_IF5]] ]
+; UNROLL-NEXT:    [[TMP12:%.*]] = phi <4 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP11]], [[PRED_UDIV_IF6]] ]
 ; UNROLL-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2
-; UNROLL-NEXT:    br i1 [[TMP13]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP13]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
 ; UNROLL:       pred.udiv.if7:
 ; UNROLL-NEXT:    [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], -2
 ; UNROLL-NEXT:    [[TMP15:%.*]] = udiv i32 219220132, [[TMP14]]
 ; UNROLL-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP15]], i64 2
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
 ; UNROLL:       pred.udiv.continue8:
-; UNROLL-NEXT:    [[TMP17:%.*]] = phi <4 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP16]], [[PRED_UDIV_IF7]] ]
+; UNROLL-NEXT:    [[TMP17:%.*]] = phi <4 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP16]], [[PRED_UDIV_IF8]] ]
 ; UNROLL-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3
-; UNROLL-NEXT:    br i1 [[TMP18]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP18]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
 ; UNROLL:       pred.udiv.if9:
 ; UNROLL-NEXT:    [[TMP19:%.*]] = add i32 [[OFFSET_IDX]], -3
 ; UNROLL-NEXT:    [[TMP20:%.*]] = udiv i32 219220132, [[TMP19]]
 ; UNROLL-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP20]], i64 3
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE11]]
 ; UNROLL:       pred.udiv.continue10:
-; UNROLL-NEXT:    [[TMP22:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP21]], [[PRED_UDIV_IF9]] ]
+; UNROLL-NEXT:    [[TMP22:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE9]] ], [ [[TMP21]], [[PRED_UDIV_IF10]] ]
 ; UNROLL-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
-; UNROLL-NEXT:    br i1 [[TMP23]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP23]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
 ; UNROLL:       pred.udiv.if11:
 ; UNROLL-NEXT:    [[TMP24:%.*]] = add i32 [[OFFSET_IDX]], -4
 ; UNROLL-NEXT:    [[TMP25:%.*]] = udiv i32 219220132, [[TMP24]]
 ; UNROLL-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> poison, i32 [[TMP25]], i64 0
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE12]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE13]]
 ; UNROLL:       pred.udiv.continue12:
-; UNROLL-NEXT:    [[TMP27:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE10]] ], [ [[TMP26]], [[PRED_UDIV_IF11]] ]
+; UNROLL-NEXT:    [[TMP27:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE11]] ], [ [[TMP26]], [[PRED_UDIV_IF12]] ]
 ; UNROLL-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
-; UNROLL-NEXT:    br i1 [[TMP28]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP28]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
 ; UNROLL:       pred.udiv.if13:
 ; UNROLL-NEXT:    [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], -5
 ; UNROLL-NEXT:    [[TMP30:%.*]] = udiv i32 219220132, [[TMP29]]
 ; UNROLL-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP30]], i64 1
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE14]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE15]]
 ; UNROLL:       pred.udiv.continue14:
-; UNROLL-NEXT:    [[TMP32:%.*]] = phi <4 x i32> [ [[TMP27]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP31]], [[PRED_UDIV_IF13]] ]
+; UNROLL-NEXT:    [[TMP32:%.*]] = phi <4 x i32> [ [[TMP27]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP31]], [[PRED_UDIV_IF14]] ]
 ; UNROLL-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
-; UNROLL-NEXT:    br i1 [[TMP33]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP33]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
 ; UNROLL:       pred.udiv.if15:
 ; UNROLL-NEXT:    [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], -6
 ; UNROLL-NEXT:    [[TMP35:%.*]] = udiv i32 219220132, [[TMP34]]
 ; UNROLL-NEXT:    [[TMP36:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP35]], i64 2
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE16]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE17]]
 ; UNROLL:       pred.udiv.continue16:
-; UNROLL-NEXT:    [[TMP37:%.*]] = phi <4 x i32> [ [[TMP32]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP36]], [[PRED_UDIV_IF15]] ]
+; UNROLL-NEXT:    [[TMP37:%.*]] = phi <4 x i32> [ [[TMP32]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP36]], [[PRED_UDIV_IF16]] ]
 ; UNROLL-NEXT:    [[TMP38:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
-; UNROLL-NEXT:    br i1 [[TMP38]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]]
+; UNROLL-NEXT:    br i1 [[TMP38]], label [[PRED_UDIV_IF18:%.*]], label [[PRED_UDIV_CONTINUE19]]
 ; UNROLL:       pred.udiv.if17:
 ; UNROLL-NEXT:    [[TMP39:%.*]] = add i32 [[OFFSET_IDX]], -7
 ; UNROLL-NEXT:    [[TMP40:%.*]] = udiv i32 219220132, [[TMP39]]
 ; UNROLL-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP40]], i64 3
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE18]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE19]]
 ; UNROLL:       pred.udiv.continue18:
-; UNROLL-NEXT:    [[TMP42]] = phi <4 x i32> [ [[TMP37]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP41]], [[PRED_UDIV_IF17]] ]
+; UNROLL-NEXT:    [[TMP42]] = phi <4 x i32> [ [[TMP37]], [[PRED_UDIV_CONTINUE17]] ], [ [[TMP41]], [[PRED_UDIV_IF18]] ]
 ; UNROLL-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> [[TMP42]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NEXT:    [[TMP45]] = add <4 x i32> [[VEC_PHI]], [[TMP43]]
-; UNROLL-NEXT:    [[TMP46]] = add <4 x i32> [[VEC_PHI1]], [[TMP44]]
+; UNROLL-NEXT:    [[TMP46]] = add <4 x i32> [[VEC_PHI2]], [[TMP44]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
 ; UNROLL-NEXT:    [[TMP47:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP47]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52:![0-9]+]], !llvm.loop [[LOOP53:![0-9]+]]
 ; UNROLL:       middle.block:
-; UNROLL-NEXT:    [[TMP48:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI1]]
+; UNROLL-NEXT:    [[TMP48:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI2]]
 ; UNROLL-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
 ; UNROLL-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP48]], [[TMP49]]
 ; UNROLL-NEXT:    [[TMP50:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
@@ -4572,17 +4572,17 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE18:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_UDIV_CONTINUE18]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE18]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_UDIV_CONTINUE18]] ]
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE19:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_UDIV_CONTINUE19]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE19]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_UDIV_CONTINUE19]] ]
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i32 0
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT3]], <i32 0, i32 1, i32 2, i32 3>
-; UNROLL-NO-IC-NEXT:    [[VEC_IV4:%.*]] = add <4 x i32> [[BROADCAST_SPLAT3]], <i32 4, i32 5, i32 6, i32 7>
+; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i32 0
+; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1, i32 2, i32 3>
+; UNROLL-NO-IC-NEXT:    [[VEC_IV5:%.*]] = add <4 x i32> [[BROADCAST_SPLAT4]], <i32 4, i32 5, i32 6, i32 7>
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = icmp ule <4 x i32> [[VEC_IV4]], [[BROADCAST_SPLAT]]
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = icmp ule <4 x i32> [[VEC_IV5]], [[BROADCAST_SPLAT]]
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if:
@@ -4593,74 +4593,74 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC:       pred.udiv.continue:
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP9]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP9]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if5:
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = udiv i32 219220132, [[TMP10]]
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE6]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE7]]
 ; UNROLL-NO-IC:       pred.udiv.continue6:
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF5]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF6]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP14]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP14]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if7:
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], -2
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = udiv i32 219220132, [[TMP15]]
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
 ; UNROLL-NO-IC:       pred.udiv.continue8:
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP17]], [[PRED_UDIV_IF7]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP17]], [[PRED_UDIV_IF8]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP19]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP19]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if9:
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], -3
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = udiv i32 219220132, [[TMP20]]
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE11]]
 ; UNROLL-NO-IC:       pred.udiv.continue10:
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP22]], [[PRED_UDIV_IF9]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE9]] ], [ [[TMP22]], [[PRED_UDIV_IF10]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if11:
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], -4
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = udiv i32 219220132, [[TMP25]]
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE12]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE13]]
 ; UNROLL-NO-IC:       pred.udiv.continue12:
-; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE10]] ], [ [[TMP27]], [[PRED_UDIV_IF11]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE11]] ], [ [[TMP27]], [[PRED_UDIV_IF12]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP29]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP29]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if13:
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], -5
 ; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = udiv i32 219220132, [[TMP30]]
 ; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE14]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE15]]
 ; UNROLL-NO-IC:       pred.udiv.continue14:
-; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP32]], [[PRED_UDIV_IF13]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP32]], [[PRED_UDIV_IF14]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP34]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP34]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if15:
 ; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = add i32 [[OFFSET_IDX]], -6
 ; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = udiv i32 219220132, [[TMP35]]
 ; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE16]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE17]]
 ; UNROLL-NO-IC:       pred.udiv.continue16:
-; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP37]], [[PRED_UDIV_IF15]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP37]], [[PRED_UDIV_IF16]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF18:%.*]], label [[PRED_UDIV_CONTINUE19]]
 ; UNROLL-NO-IC:       pred.udiv.if17:
 ; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = add i32 [[OFFSET_IDX]], -7
 ; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = udiv i32 219220132, [[TMP40]]
 ; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE18]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE19]]
 ; UNROLL-NO-IC:       pred.udiv.continue18:
-; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP42]], [[PRED_UDIV_IF17]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE17]] ], [ [[TMP42]], [[PRED_UDIV_IF18]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP23]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]]
-; UNROLL-NO-IC-NEXT:    [[TMP47]] = add <4 x i32> [[VEC_PHI1]], [[TMP45]]
+; UNROLL-NO-IC-NEXT:    [[TMP47]] = add <4 x i32> [[VEC_PHI2]], [[TMP45]]
 ; UNROLL-NO-IC-NEXT:    [[TMP48:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]]
-; UNROLL-NO-IC-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI1]]
+; UNROLL-NO-IC-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI2]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52:![0-9]+]], !llvm.loop [[LOOP53:![0-9]+]]
@@ -4705,7 +4705,7 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE5:%.*]] ]
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[PRED_UDIV_CONTINUE5]] ]
 ; UNROLL-NO-VF-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[PRED_UDIV_CONTINUE5]] ]
-; UNROLL-NO-VF-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_UDIV_CONTINUE5]] ]
+; UNROLL-NO-VF-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_UDIV_CONTINUE5]] ]
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
 ; UNROLL-NO-VF-NEXT:    [[VEC_IV:%.*]] = add i32 [[INDEX]], 0
 ; UNROLL-NO-VF-NEXT:    [[VEC_IV3:%.*]] = add i32 [[INDEX]], 1
@@ -4720,15 +4720,15 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5]]
 ; UNROLL-NO-VF:       pred.udiv.if4:
-; UNROLL-NO-VF-NEXT:    [[INDUCTION2:%.*]] = add i32 [[OFFSET_IDX]], -1
-; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = udiv i32 219220132, [[INDUCTION2]]
+; UNROLL-NO-VF-NEXT:    [[INDUCTION1:%.*]] = add i32 [[OFFSET_IDX]], -1
+; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = udiv i32 219220132, [[INDUCTION1]]
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
 ; UNROLL-NO-VF:       pred.udiv.continue5:
 ; UNROLL-NO-VF-NEXT:    [[TMP7]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP6]], [[PRED_UDIV_IF4]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP8]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]]
-; UNROLL-NO-VF-NEXT:    [[TMP9]] = add i32 [[VEC_PHI1]], [[TMP5]]
+; UNROLL-NO-VF-NEXT:    [[TMP9]] = add i32 [[VEC_PHI2]], [[TMP5]]
 ; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = select i1 [[TMP2]], i32 [[TMP8]], i32 [[VEC_PHI]]
-; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = select i1 [[TMP3]], i32 [[TMP9]], i32 [[VEC_PHI1]]
+; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = select i1 [[TMP3]], i32 [[TMP9]], i32 [[VEC_PHI2]]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF51:![0-9]+]], !llvm.loop [[LOOP52:![0-9]+]]
@@ -4876,15 +4876,15 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE13:%.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE13]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[PRED_STORE_CONTINUE13]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[PRED_STORE_CONTINUE13]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[PRED_STORE_CONTINUE15]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[PRED_STORE_CONTINUE15]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[PRED_STORE_CONTINUE15]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -2
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], -3
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ule <4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; CHECK:       pred.udiv.if:
@@ -4894,29 +4894,29 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; CHECK:       pred.udiv.continue:
 ; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1
-; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_UDIV_IF2:%.*]], label [[PRED_UDIV_CONTINUE3:%.*]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
 ; CHECK:       pred.udiv.if2:
 ; CHECK-NEXT:    [[TMP11:%.*]] = udiv i32 219220132, [[TMP2]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP11]], i64 1
-; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE3]]
+; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
 ; CHECK:       pred.udiv.continue3:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF2]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF4]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2
-; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
 ; CHECK:       pred.udiv.if4:
 ; CHECK-NEXT:    [[TMP15:%.*]] = udiv i32 219220132, [[TMP3]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP15]], i64 2
-; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
+; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE7]]
 ; CHECK:       pred.udiv.continue5:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE3]] ], [ [[TMP16]], [[PRED_UDIV_IF4]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP16]], [[PRED_UDIV_IF6]] ]
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3
-; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
 ; CHECK:       pred.udiv.if6:
 ; CHECK-NEXT:    [[TMP19:%.*]] = udiv i32 219220132, [[TMP4]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP19]], i64 3
-; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE7]]
+; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
 ; CHECK:       pred.udiv.continue7:
-; CHECK-NEXT:    [[TMP21]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP20]], [[PRED_UDIV_IF6]] ]
+; CHECK-NEXT:    [[TMP21]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP20]], [[PRED_UDIV_IF8]] ]
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP21]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP23]] = add <4 x i32> [[VEC_PHI]], [[TMP22]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0
@@ -4928,34 +4928,34 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1
-; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
 ; CHECK:       pred.store.if8:
 ; CHECK-NEXT:    [[TMP28:%.*]] = or i32 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP29]]
 ; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP30]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE11]]
 ; CHECK:       pred.store.continue9:
 ; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2
-; CHECK-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; CHECK-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
 ; CHECK:       pred.store.if10:
 ; CHECK-NEXT:    [[TMP32:%.*]] = or i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
 ; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP33]]
 ; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP34]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE13]]
 ; CHECK:       pred.store.continue11:
 ; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3
-; CHECK-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]]
 ; CHECK:       pred.store.if12:
 ; CHECK-NEXT:    [[TMP36:%.*]] = or i32 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP37:%.*]] = sext i32 [[TMP36]] to i64
 ; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP37]]
 ; CHECK-NEXT:    store i32 [[TMP4]], i32* [[TMP38]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE13]]
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE15]]
 ; CHECK:       pred.store.continue13:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52]], !llvm.loop [[LOOP56:![0-9]+]]
 ; CHECK:       middle.block:
@@ -4984,12 +4984,12 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; UNROLL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
-; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE31:%.*]] ]
-; UNROLL-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE35:%.*]] ]
+; UNROLL-NEXT:    [[VEC_IND3:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NEXT:    [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NEXT:    [[STEP_ADD4:%.*]] = add <4 x i32> [[VEC_IND3]], <i32 4, i32 4, i32 4, i32 4>
 ; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
 ; UNROLL-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -2
@@ -4998,8 +4998,8 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; UNROLL-NEXT:    [[TMP6:%.*]] = add i32 [[OFFSET_IDX]], -5
 ; UNROLL-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], -6
 ; UNROLL-NEXT:    [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], -7
-; UNROLL-NEXT:    [[TMP9:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; UNROLL-NEXT:    [[TMP10:%.*]] = icmp ule <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; UNROLL-NEXT:    [[TMP9:%.*]] = icmp ule <4 x i32> [[VEC_IND3]], [[BROADCAST_SPLAT]]
+; UNROLL-NEXT:    [[TMP10:%.*]] = icmp ule <4 x i32> [[STEP_ADD4]], [[BROADCAST_SPLAT]]
 ; UNROLL-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP9]], i64 0
 ; UNROLL-NEXT:    br i1 [[TMP11]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; UNROLL:       pred.udiv.if:
@@ -5009,65 +5009,65 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; UNROLL:       pred.udiv.continue:
 ; UNROLL-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP9]], i64 1
-; UNROLL-NEXT:    br i1 [[TMP15]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP15]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
 ; UNROLL:       pred.udiv.if4:
 ; UNROLL-NEXT:    [[TMP16:%.*]] = udiv i32 219220132, [[TMP2]]
 ; UNROLL-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP16]], i64 1
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
 ; UNROLL:       pred.udiv.continue5:
-; UNROLL-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_UDIV_IF4]] ]
+; UNROLL-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_UDIV_IF8]] ]
 ; UNROLL-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP9]], i64 2
-; UNROLL-NEXT:    br i1 [[TMP19]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP19]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
 ; UNROLL:       pred.udiv.if6:
 ; UNROLL-NEXT:    [[TMP20:%.*]] = udiv i32 219220132, [[TMP3]]
 ; UNROLL-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP20]], i64 2
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE7]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE11]]
 ; UNROLL:       pred.udiv.continue7:
-; UNROLL-NEXT:    [[TMP22:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP21]], [[PRED_UDIV_IF6]] ]
+; UNROLL-NEXT:    [[TMP22:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE9]] ], [ [[TMP21]], [[PRED_UDIV_IF10]] ]
 ; UNROLL-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[TMP9]], i64 3
-; UNROLL-NEXT:    br i1 [[TMP23]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP23]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
 ; UNROLL:       pred.udiv.if8:
 ; UNROLL-NEXT:    [[TMP24:%.*]] = udiv i32 219220132, [[TMP4]]
 ; UNROLL-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP24]], i64 3
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE13]]
 ; UNROLL:       pred.udiv.continue9:
-; UNROLL-NEXT:    [[TMP26:%.*]] = phi <4 x i32> [ [[TMP22]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP25]], [[PRED_UDIV_IF8]] ]
+; UNROLL-NEXT:    [[TMP26:%.*]] = phi <4 x i32> [ [[TMP22]], [[PRED_UDIV_CONTINUE11]] ], [ [[TMP25]], [[PRED_UDIV_IF12]] ]
 ; UNROLL-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[TMP10]], i64 0
-; UNROLL-NEXT:    br i1 [[TMP27]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP27]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
 ; UNROLL:       pred.udiv.if10:
 ; UNROLL-NEXT:    [[TMP28:%.*]] = udiv i32 219220132, [[TMP5]]
 ; UNROLL-NEXT:    [[TMP29:%.*]] = insertelement <4 x i32> poison, i32 [[TMP28]], i64 0
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE11]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE15]]
 ; UNROLL:       pred.udiv.continue11:
-; UNROLL-NEXT:    [[TMP30:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE9]] ], [ [[TMP29]], [[PRED_UDIV_IF10]] ]
+; UNROLL-NEXT:    [[TMP30:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE13]] ], [ [[TMP29]], [[PRED_UDIV_IF14]] ]
 ; UNROLL-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP10]], i64 1
-; UNROLL-NEXT:    br i1 [[TMP31]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP31]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
 ; UNROLL:       pred.udiv.if12:
 ; UNROLL-NEXT:    [[TMP32:%.*]] = udiv i32 219220132, [[TMP6]]
 ; UNROLL-NEXT:    [[TMP33:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP32]], i64 1
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE13]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE17]]
 ; UNROLL:       pred.udiv.continue13:
-; UNROLL-NEXT:    [[TMP34:%.*]] = phi <4 x i32> [ [[TMP30]], [[PRED_UDIV_CONTINUE11]] ], [ [[TMP33]], [[PRED_UDIV_IF12]] ]
+; UNROLL-NEXT:    [[TMP34:%.*]] = phi <4 x i32> [ [[TMP30]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP33]], [[PRED_UDIV_IF16]] ]
 ; UNROLL-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[TMP10]], i64 2
-; UNROLL-NEXT:    br i1 [[TMP35]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP35]], label [[PRED_UDIV_IF18:%.*]], label [[PRED_UDIV_CONTINUE19:%.*]]
 ; UNROLL:       pred.udiv.if14:
 ; UNROLL-NEXT:    [[TMP36:%.*]] = udiv i32 219220132, [[TMP7]]
 ; UNROLL-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP36]], i64 2
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE15]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE19]]
 ; UNROLL:       pred.udiv.continue15:
-; UNROLL-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP34]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP37]], [[PRED_UDIV_IF14]] ]
+; UNROLL-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP34]], [[PRED_UDIV_CONTINUE17]] ], [ [[TMP37]], [[PRED_UDIV_IF18]] ]
 ; UNROLL-NEXT:    [[TMP39:%.*]] = extractelement <4 x i1> [[TMP10]], i64 3
-; UNROLL-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF20:%.*]], label [[PRED_UDIV_CONTINUE21:%.*]]
 ; UNROLL:       pred.udiv.if16:
 ; UNROLL-NEXT:    [[TMP40:%.*]] = udiv i32 219220132, [[TMP8]]
 ; UNROLL-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP40]], i64 3
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE17]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE21]]
 ; UNROLL:       pred.udiv.continue17:
-; UNROLL-NEXT:    [[TMP42]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP41]], [[PRED_UDIV_IF16]] ]
+; UNROLL-NEXT:    [[TMP42]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE19]] ], [ [[TMP41]], [[PRED_UDIV_IF20]] ]
 ; UNROLL-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP26]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP42]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NEXT:    [[TMP45]] = add <4 x i32> [[VEC_PHI]], [[TMP43]]
-; UNROLL-NEXT:    [[TMP46]] = add <4 x i32> [[VEC_PHI3]], [[TMP44]]
+; UNROLL-NEXT:    [[TMP46]] = add <4 x i32> [[VEC_PHI7]], [[TMP44]]
 ; UNROLL-NEXT:    [[TMP47:%.*]] = extractelement <4 x i1> [[TMP9]], i64 0
 ; UNROLL-NEXT:    br i1 [[TMP47]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL:       pred.store.if:
@@ -5077,74 +5077,74 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; UNROLL:       pred.store.continue:
 ; UNROLL-NEXT:    [[TMP50:%.*]] = extractelement <4 x i1> [[TMP9]], i64 1
-; UNROLL-NEXT:    br i1 [[TMP50]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP50]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
 ; UNROLL:       pred.store.if18:
 ; UNROLL-NEXT:    [[TMP51:%.*]] = or i32 [[INDEX]], 1
 ; UNROLL-NEXT:    [[TMP52:%.*]] = sext i32 [[TMP51]] to i64
 ; UNROLL-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP52]]
 ; UNROLL-NEXT:    store i32 [[TMP2]], i32* [[TMP53]], align 4
-; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE19]]
+; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE23]]
 ; UNROLL:       pred.store.continue19:
 ; UNROLL-NEXT:    [[TMP54:%.*]] = extractelement <4 x i1> [[TMP9]], i64 2
-; UNROLL-NEXT:    br i1 [[TMP54]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP54]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
 ; UNROLL:       pred.store.if20:
 ; UNROLL-NEXT:    [[TMP55:%.*]] = or i32 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP56:%.*]] = sext i32 [[TMP55]] to i64
 ; UNROLL-NEXT:    [[TMP57:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP56]]
 ; UNROLL-NEXT:    store i32 [[TMP3]], i32* [[TMP57]], align 4
-; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE21]]
+; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE25]]
 ; UNROLL:       pred.store.continue21:
 ; UNROLL-NEXT:    [[TMP58:%.*]] = extractelement <4 x i1> [[TMP9]], i64 3
-; UNROLL-NEXT:    br i1 [[TMP58]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP58]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
 ; UNROLL:       pred.store.if22:
 ; UNROLL-NEXT:    [[TMP59:%.*]] = or i32 [[INDEX]], 3
 ; UNROLL-NEXT:    [[TMP60:%.*]] = sext i32 [[TMP59]] to i64
 ; UNROLL-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP60]]
 ; UNROLL-NEXT:    store i32 [[TMP4]], i32* [[TMP61]], align 4
-; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE23]]
+; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE27]]
 ; UNROLL:       pred.store.continue23:
 ; UNROLL-NEXT:    [[TMP62:%.*]] = extractelement <4 x i1> [[TMP10]], i64 0
-; UNROLL-NEXT:    br i1 [[TMP62]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP62]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
 ; UNROLL:       pred.store.if24:
 ; UNROLL-NEXT:    [[TMP63:%.*]] = or i32 [[INDEX]], 4
 ; UNROLL-NEXT:    [[TMP64:%.*]] = sext i32 [[TMP63]] to i64
 ; UNROLL-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP64]]
 ; UNROLL-NEXT:    store i32 [[TMP5]], i32* [[TMP65]], align 4
-; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE25]]
+; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE29]]
 ; UNROLL:       pred.store.continue25:
 ; UNROLL-NEXT:    [[TMP66:%.*]] = extractelement <4 x i1> [[TMP10]], i64 1
-; UNROLL-NEXT:    br i1 [[TMP66]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP66]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]]
 ; UNROLL:       pred.store.if26:
 ; UNROLL-NEXT:    [[TMP67:%.*]] = or i32 [[INDEX]], 5
 ; UNROLL-NEXT:    [[TMP68:%.*]] = sext i32 [[TMP67]] to i64
 ; UNROLL-NEXT:    [[TMP69:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP68]]
 ; UNROLL-NEXT:    store i32 [[TMP6]], i32* [[TMP69]], align 4
-; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE27]]
+; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE31]]
 ; UNROLL:       pred.store.continue27:
 ; UNROLL-NEXT:    [[TMP70:%.*]] = extractelement <4 x i1> [[TMP10]], i64 2
-; UNROLL-NEXT:    br i1 [[TMP70]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
+; UNROLL-NEXT:    br i1 [[TMP70]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]]
 ; UNROLL:       pred.store.if28:
 ; UNROLL-NEXT:    [[TMP71:%.*]] = or i32 [[INDEX]], 6
 ; UNROLL-NEXT:    [[TMP72:%.*]] = sext i32 [[TMP71]] to i64
 ; UNROLL-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP72]]
 ; UNROLL-NEXT:    store i32 [[TMP7]], i32* [[TMP73]], align 4
-; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE29]]
+; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE33]]
 ; UNROLL:       pred.store.continue29:
 ; UNROLL-NEXT:    [[TMP74:%.*]] = extractelement <4 x i1> [[TMP10]], i64 3
-; UNROLL-NEXT:    br i1 [[TMP74]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31]]
+; UNROLL-NEXT:    br i1 [[TMP74]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35]]
 ; UNROLL:       pred.store.if30:
 ; UNROLL-NEXT:    [[TMP75:%.*]] = or i32 [[INDEX]], 7
 ; UNROLL-NEXT:    [[TMP76:%.*]] = sext i32 [[TMP75]] to i64
 ; UNROLL-NEXT:    [[TMP77:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP76]]
 ; UNROLL-NEXT:    store i32 [[TMP8]], i32* [[TMP77]], align 4
-; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE31]]
+; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE35]]
 ; UNROLL:       pred.store.continue31:
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
-; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
+; UNROLL-NEXT:    [[VEC_IND_NEXT6]] = add <4 x i32> [[VEC_IND3]], <i32 8, i32 8, i32 8, i32 8>
 ; UNROLL-NEXT:    [[TMP78:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP78]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52]], !llvm.loop [[LOOP56:![0-9]+]]
 ; UNROLL:       middle.block:
-; UNROLL-NEXT:    [[TMP79:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI3]]
+; UNROLL-NEXT:    [[TMP79:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI7]]
 ; UNROLL-NEXT:    [[TMP80:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
 ; UNROLL-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP79]], [[TMP80]]
 ; UNROLL-NEXT:    [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
@@ -5173,12 +5173,12 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE31:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE35:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_IND3:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NO-IC-NEXT:    [[STEP_ADD4:%.*]] = add <4 x i32> [[VEC_IND3]], <i32 4, i32 4, i32 4, i32 4>
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -1
@@ -5188,8 +5188,8 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], -5
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], -6
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], -7
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = icmp ule <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = icmp ule <4 x i32> [[VEC_IND3]], [[BROADCAST_SPLAT]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = icmp ule <4 x i32> [[STEP_ADD4]], [[BROADCAST_SPLAT]]
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP12]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if:
@@ -5199,65 +5199,65 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; UNROLL-NO-IC:       pred.udiv.continue:
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP16]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP16]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if4:
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = udiv i32 219220132, [[TMP3]]
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP17]], i32 1
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
 ; UNROLL-NO-IC:       pred.udiv.continue5:
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_UDIV_IF4]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_UDIV_IF8]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP20]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP20]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if6:
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = udiv i32 219220132, [[TMP4]]
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP21]], i32 2
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE7]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE11]]
 ; UNROLL-NO-IC:       pred.udiv.continue7:
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP22]], [[PRED_UDIV_IF6]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE9]] ], [ [[TMP22]], [[PRED_UDIV_IF10]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if8:
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = udiv i32 219220132, [[TMP5]]
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[TMP25]], i32 3
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE13]]
 ; UNROLL-NO-IC:       pred.udiv.continue9:
-; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP26]], [[PRED_UDIV_IF8]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE11]] ], [ [[TMP26]], [[PRED_UDIV_IF12]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP28]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP28]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if10:
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = udiv i32 219220132, [[TMP6]]
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP29]], i32 0
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE11]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE15]]
 ; UNROLL-NO-IC:       pred.udiv.continue11:
-; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE9]] ], [ [[TMP30]], [[PRED_UDIV_IF10]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE13]] ], [ [[TMP30]], [[PRED_UDIV_IF14]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP32]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP32]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if12:
 ; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = udiv i32 219220132, [[TMP7]]
 ; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP33]], i32 1
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE13]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE17]]
 ; UNROLL-NO-IC:       pred.udiv.continue13:
-; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE11]] ], [ [[TMP34]], [[PRED_UDIV_IF12]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP34]], [[PRED_UDIV_IF16]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP36]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP36]], label [[PRED_UDIV_IF18:%.*]], label [[PRED_UDIV_CONTINUE19:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if14:
 ; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = udiv i32 219220132, [[TMP8]]
 ; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP37]], i32 2
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE15]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE19]]
 ; UNROLL-NO-IC:       pred.udiv.continue15:
-; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = phi <4 x i32> [ [[TMP35]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP38]], [[PRED_UDIV_IF14]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = phi <4 x i32> [ [[TMP35]], [[PRED_UDIV_CONTINUE17]] ], [ [[TMP38]], [[PRED_UDIV_IF18]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP40]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP40]], label [[PRED_UDIV_IF20:%.*]], label [[PRED_UDIV_CONTINUE21:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if16:
 ; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = udiv i32 219220132, [[TMP9]]
 ; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP41]], i32 3
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE17]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE21]]
 ; UNROLL-NO-IC:       pred.udiv.continue17:
-; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP39]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP42]], [[PRED_UDIV_IF16]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP39]], [[PRED_UDIV_CONTINUE19]] ], [ [[TMP42]], [[PRED_UDIV_IF20]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP27]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP27]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]]
-; UNROLL-NO-IC-NEXT:    [[TMP47]] = add <4 x i32> [[VEC_PHI3]], [[TMP45]]
+; UNROLL-NO-IC-NEXT:    [[TMP47]] = add <4 x i32> [[VEC_PHI7]], [[TMP45]]
 ; UNROLL-NO-IC-NEXT:    [[TMP48:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP48]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NO-IC:       pred.store.if:
@@ -5267,65 +5267,65 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; UNROLL-NO-IC:       pred.store.continue:
 ; UNROLL-NO-IC-NEXT:    [[TMP51:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP51]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP51]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
 ; UNROLL-NO-IC:       pred.store.if18:
 ; UNROLL-NO-IC-NEXT:    [[TMP52:%.*]] = add i32 [[INDEX]], 1
 ; UNROLL-NO-IC-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP52]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP3]], i32* [[TMP53]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE19]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE23]]
 ; UNROLL-NO-IC:       pred.store.continue19:
 ; UNROLL-NO-IC-NEXT:    [[TMP54:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP54]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP54]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
 ; UNROLL-NO-IC:       pred.store.if20:
 ; UNROLL-NO-IC-NEXT:    [[TMP55:%.*]] = add i32 [[INDEX]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP55]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP4]], i32* [[TMP56]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE21]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE25]]
 ; UNROLL-NO-IC:       pred.store.continue21:
 ; UNROLL-NO-IC-NEXT:    [[TMP57:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP57]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP57]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
 ; UNROLL-NO-IC:       pred.store.if22:
 ; UNROLL-NO-IC-NEXT:    [[TMP58:%.*]] = add i32 [[INDEX]], 3
 ; UNROLL-NO-IC-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP58]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP5]], i32* [[TMP59]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE23]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE27]]
 ; UNROLL-NO-IC:       pred.store.continue23:
 ; UNROLL-NO-IC-NEXT:    [[TMP60:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP60]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP60]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
 ; UNROLL-NO-IC:       pred.store.if24:
 ; UNROLL-NO-IC-NEXT:    [[TMP61:%.*]] = add i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP61]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP6]], i32* [[TMP62]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE25]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE29]]
 ; UNROLL-NO-IC:       pred.store.continue25:
 ; UNROLL-NO-IC-NEXT:    [[TMP63:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP63]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP63]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]]
 ; UNROLL-NO-IC:       pred.store.if26:
 ; UNROLL-NO-IC-NEXT:    [[TMP64:%.*]] = add i32 [[INDEX]], 5
 ; UNROLL-NO-IC-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP64]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP7]], i32* [[TMP65]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE27]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE31]]
 ; UNROLL-NO-IC:       pred.store.continue27:
 ; UNROLL-NO-IC-NEXT:    [[TMP66:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP66]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP66]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]]
 ; UNROLL-NO-IC:       pred.store.if28:
 ; UNROLL-NO-IC-NEXT:    [[TMP67:%.*]] = add i32 [[INDEX]], 6
 ; UNROLL-NO-IC-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP67]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP8]], i32* [[TMP68]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE29]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE33]]
 ; UNROLL-NO-IC:       pred.store.continue29:
 ; UNROLL-NO-IC-NEXT:    [[TMP69:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP69]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP69]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35]]
 ; UNROLL-NO-IC:       pred.store.if30:
 ; UNROLL-NO-IC-NEXT:    [[TMP70:%.*]] = add i32 [[INDEX]], 7
 ; UNROLL-NO-IC-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP70]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP9]], i32* [[TMP71]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE31]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE35]]
 ; UNROLL-NO-IC:       pred.store.continue31:
 ; UNROLL-NO-IC-NEXT:    [[TMP72:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]]
-; UNROLL-NO-IC-NEXT:    [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI3]]
+; UNROLL-NO-IC-NEXT:    [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI7]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
+; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT6]] = add <4 x i32> [[STEP_ADD4]], <i32 4, i32 4, i32 4, i32 4>
 ; UNROLL-NO-IC-NEXT:    [[TMP74:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52]], !llvm.loop [[LOOP56:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
@@ -5374,44 +5374,44 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ]
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[PRED_STORE_CONTINUE10]] ]
 ; UNROLL-NO-VF-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[PRED_STORE_CONTINUE10]] ]
-; UNROLL-NO-VF-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_STORE_CONTINUE10]] ]
+; UNROLL-NO-VF-NEXT:    [[VEC_PHI5:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_STORE_CONTINUE10]] ]
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
-; UNROLL-NO-VF-NEXT:    [[INDUCTION4:%.*]] = add i32 [[OFFSET_IDX]], 0
-; UNROLL-NO-VF-NEXT:    [[INDUCTION5:%.*]] = add i32 [[OFFSET_IDX]], -1
+; UNROLL-NO-VF-NEXT:    [[INDUCTION:%.*]] = add i32 [[OFFSET_IDX]], 0
+; UNROLL-NO-VF-NEXT:    [[INDUCTION2:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NO-VF-NEXT:    [[VEC_IV:%.*]] = add i32 [[INDEX]], 0
 ; UNROLL-NO-VF-NEXT:    [[VEC_IV6:%.*]] = add i32 [[INDEX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = icmp ule i32 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]]
 ; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[VEC_IV6]], [[TRIP_COUNT_MINUS_1]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP2]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; UNROLL-NO-VF:       pred.udiv.if:
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = udiv i32 219220132, [[INDUCTION4]]
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = udiv i32 219220132, [[INDUCTION]]
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; UNROLL-NO-VF:       pred.udiv.continue:
 ; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
 ; UNROLL-NO-VF:       pred.udiv.if7:
-; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = udiv i32 219220132, [[INDUCTION5]]
+; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = udiv i32 219220132, [[INDUCTION2]]
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; UNROLL-NO-VF:       pred.udiv.continue8:
 ; UNROLL-NO-VF-NEXT:    [[TMP7]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP6]], [[PRED_UDIV_IF7]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP8]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]]
-; UNROLL-NO-VF-NEXT:    [[TMP9]] = add i32 [[VEC_PHI2]], [[TMP5]]
+; UNROLL-NO-VF-NEXT:    [[TMP9]] = add i32 [[VEC_PHI5]], [[TMP5]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NO-VF:       pred.store.if:
-; UNROLL-NO-VF-NEXT:    [[INDUCTION:%.*]] = add i32 [[INDEX]], 0
-; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT:    store i32 [[INDUCTION4]], i32* [[TMP10]], align 4
+; UNROLL-NO-VF-NEXT:    [[INDUCTION3:%.*]] = add i32 [[INDEX]], 0
+; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDUCTION3]]
+; UNROLL-NO-VF-NEXT:    store i32 [[INDUCTION]], i32* [[TMP10]], align 4
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; UNROLL-NO-VF:       pred.store.continue:
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]]
 ; UNROLL-NO-VF:       pred.store.if9:
-; UNROLL-NO-VF-NEXT:    [[INDUCTION3:%.*]] = add i32 [[INDEX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[INDUCTION3]]
-; UNROLL-NO-VF-NEXT:    store i32 [[INDUCTION5]], i32* [[TMP11]], align 4
+; UNROLL-NO-VF-NEXT:    [[INDUCTION4:%.*]] = add i32 [[INDEX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[INDUCTION4]]
+; UNROLL-NO-VF-NEXT:    store i32 [[INDUCTION2]], i32* [[TMP11]], align 4
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_STORE_CONTINUE10]]
 ; UNROLL-NO-VF:       pred.store.continue10:
 ; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = select i1 [[TMP2]], i32 [[TMP8]], i32 [[VEC_PHI]]
-; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP9]], i32 [[VEC_PHI2]]
+; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP9]], i32 [[VEC_PHI5]]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF51]], !llvm.loop [[LOOP55:![0-9]+]]
@@ -5457,16 +5457,16 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE13:%.*]] ]
-; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE13]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[PRED_STORE_CONTINUE13]] ]
-; SINK-AFTER-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[PRED_STORE_CONTINUE13]] ]
+; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ]
+; SINK-AFTER-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[PRED_STORE_CONTINUE15]] ]
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[PRED_STORE_CONTINUE15]] ]
+; SINK-AFTER-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[PRED_STORE_CONTINUE15]] ]
 ; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
 ; SINK-AFTER-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; SINK-AFTER-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; SINK-AFTER-NEXT:    [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], -2
 ; SINK-AFTER-NEXT:    [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], -3
-; SINK-AFTER-NEXT:    [[TMP6:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SINK-AFTER-NEXT:    [[TMP6:%.*]] = icmp ule <4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]]
 ; SINK-AFTER-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
 ; SINK-AFTER-NEXT:    br i1 [[TMP7]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; SINK-AFTER:       pred.udiv.if:
@@ -5476,29 +5476,29 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; SINK-AFTER:       pred.udiv.continue:
 ; SINK-AFTER-NEXT:    [[TMP10:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UDIV_IF]] ]
 ; SINK-AFTER-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1
-; SINK-AFTER-NEXT:    br i1 [[TMP11]], label [[PRED_UDIV_IF2:%.*]], label [[PRED_UDIV_CONTINUE3:%.*]]
+; SINK-AFTER-NEXT:    br i1 [[TMP11]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
 ; SINK-AFTER:       pred.udiv.if2:
 ; SINK-AFTER-NEXT:    [[TMP12:%.*]] = udiv i32 219220132, [[TMP3]]
 ; SINK-AFTER-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP12]], i32 1
-; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE3]]
+; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
 ; SINK-AFTER:       pred.udiv.continue3:
-; SINK-AFTER-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF2]] ]
+; SINK-AFTER-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF4]] ]
 ; SINK-AFTER-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2
-; SINK-AFTER-NEXT:    br i1 [[TMP15]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
+; SINK-AFTER-NEXT:    br i1 [[TMP15]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
 ; SINK-AFTER:       pred.udiv.if4:
 ; SINK-AFTER-NEXT:    [[TMP16:%.*]] = udiv i32 219220132, [[TMP4]]
 ; SINK-AFTER-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP16]], i32 2
-; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
+; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE7]]
 ; SINK-AFTER:       pred.udiv.continue5:
-; SINK-AFTER-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE3]] ], [ [[TMP17]], [[PRED_UDIV_IF4]] ]
+; SINK-AFTER-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP17]], [[PRED_UDIV_IF6]] ]
 ; SINK-AFTER-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3
-; SINK-AFTER-NEXT:    br i1 [[TMP19]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
+; SINK-AFTER-NEXT:    br i1 [[TMP19]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
 ; SINK-AFTER:       pred.udiv.if6:
 ; SINK-AFTER-NEXT:    [[TMP20:%.*]] = udiv i32 219220132, [[TMP5]]
 ; SINK-AFTER-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP20]], i32 3
-; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE7]]
+; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
 ; SINK-AFTER:       pred.udiv.continue7:
-; SINK-AFTER-NEXT:    [[TMP22]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP21]], [[PRED_UDIV_IF6]] ]
+; SINK-AFTER-NEXT:    [[TMP22]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP21]], [[PRED_UDIV_IF8]] ]
 ; SINK-AFTER-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23]]
 ; SINK-AFTER-NEXT:    [[TMP25:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
@@ -5510,32 +5510,32 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; SINK-AFTER:       pred.store.continue:
 ; SINK-AFTER-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1
-; SINK-AFTER-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; SINK-AFTER-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
 ; SINK-AFTER:       pred.store.if8:
 ; SINK-AFTER-NEXT:    [[TMP29:%.*]] = add i32 [[INDEX]], 1
 ; SINK-AFTER-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP29]]
 ; SINK-AFTER-NEXT:    store i32 [[TMP3]], i32* [[TMP30]], align 4
-; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE11]]
 ; SINK-AFTER:       pred.store.continue9:
 ; SINK-AFTER-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2
-; SINK-AFTER-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; SINK-AFTER-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
 ; SINK-AFTER:       pred.store.if10:
 ; SINK-AFTER-NEXT:    [[TMP32:%.*]] = add i32 [[INDEX]], 2
 ; SINK-AFTER-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP32]]
 ; SINK-AFTER-NEXT:    store i32 [[TMP4]], i32* [[TMP33]], align 4
-; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE13]]
 ; SINK-AFTER:       pred.store.continue11:
 ; SINK-AFTER-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3
-; SINK-AFTER-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13]]
+; SINK-AFTER-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]]
 ; SINK-AFTER:       pred.store.if12:
 ; SINK-AFTER-NEXT:    [[TMP35:%.*]] = add i32 [[INDEX]], 3
 ; SINK-AFTER-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP35]]
 ; SINK-AFTER-NEXT:    store i32 [[TMP5]], i32* [[TMP36]], align 4
-; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE13]]
+; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE15]]
 ; SINK-AFTER:       pred.store.continue13:
 ; SINK-AFTER-NEXT:    [[TMP37:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]]
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; SINK-AFTER-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
 ; SINK-AFTER-NEXT:    [[TMP38:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52]], !llvm.loop [[LOOP56:![0-9]+]]
 ; SINK-AFTER:       middle.block:
@@ -5652,35 +5652,35 @@ define void @sink_after_dead_inst(i32* %A.ptr) {
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], <i16 4, i16 4, i16 4, i16 4>
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
-; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 4
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add <4 x i16> [[VEC_IND]], <i16 1, i16 1, i16 1, i16 1>
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add <4 x i16> [[STEP_ADD]], <i16 1, i16 1, i16 1, i16 1>
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = or <4 x i16> [[TMP2]], [[TMP2]]
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = or <4 x i16> [[TMP3]], [[TMP3]]
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
-; UNROLL-NO-IC-NEXT:    [[TMP7]] = zext <4 x i16> [[TMP5]] to <4 x i32>
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP6]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[TMP0]]
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[A_PTR]], i16 [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    store <4 x i32> zeroinitializer, <4 x i32>* [[TMP13]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[TMP10]], i32 4
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT:    store <4 x i32> zeroinitializer, <4 x i32>* [[TMP15]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i16 [[OFFSET_IDX]], 4
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add <4 x i16> [[VEC_IND]], <i16 1, i16 1, i16 1, i16 1>
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], <i16 1, i16 1, i16 1, i16 1>
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = or <4 x i16> [[TMP8]], [[TMP8]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = or <4 x i16> [[TMP9]], [[TMP9]]
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = zext <4 x i16> [[TMP10]] to <4 x i32>
+; UNROLL-NO-IC-NEXT:    [[TMP13]] = zext <4 x i16> [[TMP11]] to <4 x i32>
+; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP12]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[TMP0]]
+; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = getelementptr i32, i32* [[A_PTR]], i16 [[TMP4]]
+; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = getelementptr i32, i32* [[TMP24]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; UNROLL-NO-IC-NEXT:    store <4 x i32> zeroinitializer, <4 x i32>* [[TMP27]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = getelementptr i32, i32* [[TMP24]], i32 4
+; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; UNROLL-NO-IC-NEXT:    store <4 x i32> zeroinitializer, <4 x i32>* [[TMP29]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], <i16 4, i16 4, i16 4, i16 4>
-; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 16, 16
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP13]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -5720,13 +5720,13 @@ define void @sink_after_dead_inst(i32* %A.ptr) {
 ; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = or i16 [[TMP1]], [[TMP1]]
 ; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP5]] = zext i16 [[TMP3]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[A_PTR]], i16 [[INDUCTION1]]
-; UNROLL-NO-VF-NEXT:    store i32 0, i32* [[TMP6]], align 4
-; UNROLL-NO-VF-NEXT:    store i32 0, i32* [[TMP7]], align 4
+; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[INDUCTION]]
+; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = getelementptr i32, i32* [[A_PTR]], i16 [[INDUCTION1]]
+; UNROLL-NO-VF-NEXT:    store i32 0, i32* [[TMP14]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 0, i32* [[TMP15]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP57:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP57:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 16, 16
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -5759,25 +5759,25 @@ define void @sink_after_dead_inst(i32* %A.ptr) {
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
 ; SINK-AFTER-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
-; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], <i16 1, i16 1, i16 1, i16 1>
-; SINK-AFTER-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
-; SINK-AFTER-NEXT:    [[TMP3]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; SINK-AFTER-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; SINK-AFTER-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[TMP0]]
-; SINK-AFTER-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0
-; SINK-AFTER-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; SINK-AFTER-NEXT:    store <4 x i32> zeroinitializer, <4 x i32>* [[TMP7]], align 4
+; SINK-AFTER-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], <i16 1, i16 1, i16 1, i16 1>
+; SINK-AFTER-NEXT:    [[TMP5:%.*]] = or <4 x i16> [[TMP4]], [[TMP4]]
+; SINK-AFTER-NEXT:    [[TMP6]] = zext <4 x i16> [[TMP5]] to <4 x i32>
+; SINK-AFTER-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP6]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[TMP0]]
+; SINK-AFTER-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 0
+; SINK-AFTER-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; SINK-AFTER-NEXT:    store <4 x i32> zeroinitializer, <4 x i32>* [[TMP14]], align 4
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], <i16 4, i16 4, i16 4, i16 4>
-; SINK-AFTER-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; SINK-AFTER-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
+; SINK-AFTER-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
+; SINK-AFTER-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
 ; SINK-AFTER:       middle.block:
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i32 16, 16
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -5799,7 +5799,6 @@ define void @sink_after_dead_inst(i32* %A.ptr) {
 ; SINK-AFTER-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP59:![0-9]+]]
 ; SINK-AFTER:       for.end:
 ; SINK-AFTER-NEXT:    ret void
-;
 entry:
   br label %loop
 

diff  --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll
index 8d3867da88a03..edb9ca80edc0e 100644
--- a/llvm/test/Transforms/LoopVectorize/float-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll
@@ -1470,7 +1470,7 @@ define void @non_primary_iv_float_scalar(float* %A, i64 %N) {
 ; VEC4_INTERL2-NEXT:    [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC4_INTERL2:       vector.body:
-; VEC4_INTERL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE16:%.*]] ]
+; VEC4_INTERL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE17:%.*]] ]
 ; VEC4_INTERL2-NEXT:    [[TMP0:%.*]] = sitofp i64 [[INDEX]] to float
 ; VEC4_INTERL2-NEXT:    [[TMP1:%.*]] = or i64 [[INDEX]], 4
 ; VEC4_INTERL2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
@@ -1478,9 +1478,9 @@ define void @non_primary_iv_float_scalar(float* %A, i64 %N) {
 ; VEC4_INTERL2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
 ; VEC4_INTERL2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 4
 ; VEC4_INTERL2-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; VEC4_INTERL2-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
+; VEC4_INTERL2-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
 ; VEC4_INTERL2-NEXT:    [[TMP6:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], zeroinitializer
-; VEC4_INTERL2-NEXT:    [[TMP7:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD2]], zeroinitializer
+; VEC4_INTERL2-NEXT:    [[TMP7:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD3]], zeroinitializer
 ; VEC4_INTERL2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP6]], i64 0
 ; VEC4_INTERL2-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; VEC4_INTERL2:       pred.store.if:
@@ -1489,66 +1489,66 @@ define void @non_primary_iv_float_scalar(float* %A, i64 %N) {
 ; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; VEC4_INTERL2:       pred.store.continue:
 ; VEC4_INTERL2-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP6]], i64 1
-; VEC4_INTERL2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; VEC4_INTERL2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
 ; VEC4_INTERL2:       pred.store.if3:
 ; VEC4_INTERL2-NEXT:    [[TMP11:%.*]] = fadd fast float [[TMP0]], 1.000000e+00
 ; VEC4_INTERL2-NEXT:    [[TMP12:%.*]] = or i64 [[INDEX]], 1
 ; VEC4_INTERL2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]]
 ; VEC4_INTERL2-NEXT:    store float [[TMP11]], float* [[TMP13]], align 4
-; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE5]]
 ; VEC4_INTERL2:       pred.store.continue4:
 ; VEC4_INTERL2-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP6]], i64 2
-; VEC4_INTERL2-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; VEC4_INTERL2-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
 ; VEC4_INTERL2:       pred.store.if5:
 ; VEC4_INTERL2-NEXT:    [[TMP15:%.*]] = fadd fast float [[TMP0]], 2.000000e+00
 ; VEC4_INTERL2-NEXT:    [[TMP16:%.*]] = or i64 [[INDEX]], 2
 ; VEC4_INTERL2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
 ; VEC4_INTERL2-NEXT:    store float [[TMP15]], float* [[TMP17]], align 4
-; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE7]]
 ; VEC4_INTERL2:       pred.store.continue6:
 ; VEC4_INTERL2-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP6]], i64 3
-; VEC4_INTERL2-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; VEC4_INTERL2-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
 ; VEC4_INTERL2:       pred.store.if7:
 ; VEC4_INTERL2-NEXT:    [[TMP19:%.*]] = fadd fast float [[TMP0]], 3.000000e+00
 ; VEC4_INTERL2-NEXT:    [[TMP20:%.*]] = or i64 [[INDEX]], 3
 ; VEC4_INTERL2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP20]]
 ; VEC4_INTERL2-NEXT:    store float [[TMP19]], float* [[TMP21]], align 4
-; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
 ; VEC4_INTERL2:       pred.store.continue8:
 ; VEC4_INTERL2-NEXT:    [[TMP22:%.*]] = extractelement <4 x i1> [[TMP7]], i64 0
-; VEC4_INTERL2-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; VEC4_INTERL2-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
 ; VEC4_INTERL2:       pred.store.if9:
 ; VEC4_INTERL2-NEXT:    [[TMP23:%.*]] = fadd fast float [[TMP0]], 4.000000e+00
 ; VEC4_INTERL2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
 ; VEC4_INTERL2-NEXT:    store float [[TMP23]], float* [[TMP24]], align 4
-; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE11]]
 ; VEC4_INTERL2:       pred.store.continue10:
 ; VEC4_INTERL2-NEXT:    [[TMP25:%.*]] = extractelement <4 x i1> [[TMP7]], i64 1
-; VEC4_INTERL2-NEXT:    br i1 [[TMP25]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; VEC4_INTERL2-NEXT:    br i1 [[TMP25]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
 ; VEC4_INTERL2:       pred.store.if11:
 ; VEC4_INTERL2-NEXT:    [[TMP26:%.*]] = fadd fast float [[TMP0]], 5.000000e+00
 ; VEC4_INTERL2-NEXT:    [[TMP27:%.*]] = or i64 [[INDEX]], 5
 ; VEC4_INTERL2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP27]]
 ; VEC4_INTERL2-NEXT:    store float [[TMP26]], float* [[TMP28]], align 4
-; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE13]]
 ; VEC4_INTERL2:       pred.store.continue12:
 ; VEC4_INTERL2-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP7]], i64 2
-; VEC4_INTERL2-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; VEC4_INTERL2-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
 ; VEC4_INTERL2:       pred.store.if13:
 ; VEC4_INTERL2-NEXT:    [[TMP30:%.*]] = fadd fast float [[TMP0]], 6.000000e+00
 ; VEC4_INTERL2-NEXT:    [[TMP31:%.*]] = or i64 [[INDEX]], 6
 ; VEC4_INTERL2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP31]]
 ; VEC4_INTERL2-NEXT:    store float [[TMP30]], float* [[TMP32]], align 4
-; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE14]]
+; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE15]]
 ; VEC4_INTERL2:       pred.store.continue14:
 ; VEC4_INTERL2-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP7]], i64 3
-; VEC4_INTERL2-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16]]
+; VEC4_INTERL2-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17]]
 ; VEC4_INTERL2:       pred.store.if15:
 ; VEC4_INTERL2-NEXT:    [[TMP34:%.*]] = fadd fast float [[TMP0]], 7.000000e+00
 ; VEC4_INTERL2-NEXT:    [[TMP35:%.*]] = or i64 [[INDEX]], 7
 ; VEC4_INTERL2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP35]]
 ; VEC4_INTERL2-NEXT:    store float [[TMP34]], float* [[TMP36]], align 4
-; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE16]]
+; VEC4_INTERL2-NEXT:    br label [[PRED_STORE_CONTINUE17]]
 ; VEC4_INTERL2:       pred.store.continue16:
 ; VEC4_INTERL2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; VEC4_INTERL2-NEXT:    [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]

diff  --git a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
index 61c13015450cd..fd6fe8ac403fc 100644
--- a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
@@ -66,7 +66,7 @@ define float @minloopattr(float* nocapture readonly %arg) #0 {
 ; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP7:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP5]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 65536, 65536
@@ -84,7 +84,7 @@ define float @minloopattr(float* nocapture readonly %arg) #0 {
 ; CHECK-NEXT:    [[T6]] = select i1 [[T5]], float [[T2]], float [[T4]]
 ; CHECK-NEXT:    [[T7]] = add i64 [[T1]], 1
 ; CHECK-NEXT:    [[T8:%.*]] = icmp eq i64 [[T7]], 65537
-; CHECK-NEXT:    br i1 [[T8]], label [[OUT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[T8]], label [[OUT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       out:
 ; CHECK-NEXT:    [[T6_LCSSA:%.*]] = phi float [ [[T6]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[T6_LCSSA]]

diff  --git a/llvm/test/Transforms/LoopVectorize/fneg.ll b/llvm/test/Transforms/LoopVectorize/fneg.ll
index af67ba950bb7f..103e795b2115f 100644
--- a/llvm/test/Transforms/LoopVectorize/fneg.ll
+++ b/llvm/test/Transforms/LoopVectorize/fneg.ll
@@ -1,45 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
 
 define void @foo(float* %a, i64 %n) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK:         [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* {{.*}}, align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fneg <4 x float> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fneg float [[TMP7]]
-; CHECK-NEXT:    store float [[SUB]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.exit:
-; CHECK-NEXT:    ret void
+; CHECK:         store <4 x float> [[TMP4]], <4 x float>* {{.*}}, align 4
 ;
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/i8-induction.ll b/llvm/test/Transforms/LoopVectorize/i8-induction.ll
index 04c824b7d104e..a9e8b755f182a 100644
--- a/llvm/test/Transforms/LoopVectorize/i8-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/i8-induction.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S
 ; RUN: opt < %s -debugify -loop-vectorize -S | FileCheck %s --check-prefix=DEBUGLOC
 
@@ -10,62 +9,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define void @f() nounwind uwtable ssp {
 ; Check that the induction phis and adds have debug location.
 ;
-; DEBUGLOC-LABEL: @f(
-; DEBUGLOC-NEXT:  scalar.ph:
-; DEBUGLOC-NEXT:    store i8 0, i8* inttoptr (i64 1 to i8*), align 1, !dbg [[DBG22:![0-9]+]]
-; DEBUGLOC-NEXT:    [[TMP0:%.*]] = load i8, i8* @a, align 1, !dbg [[DBG23:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8 [[TMP0]], metadata [[META9:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]]
-; DEBUGLOC-NEXT:    br i1 false, label [[SCALAR_PH1:%.*]], label [[VECTOR_PH:%.*]], !dbg [[DBG24:![0-9]+]]
-; DEBUGLOC:       vector.ph:
-; DEBUGLOC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i32 0, !dbg [[DBG24]]
-; DEBUGLOC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer, !dbg [[DBG24]]
-; DEBUGLOC-NEXT:    br label [[VECTOR_BODY:%.*]], !dbg [[DBG24]]
-; DEBUGLOC:       vector.body:
-; DEBUGLOC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DEBUGLOC-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 1, i8 1>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; DEBUGLOC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i8> [ undef, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ], !dbg [[DBG25:![0-9]+]]
-; DEBUGLOC-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i8> [[VEC_IND]], zeroinitializer, !dbg [[DBG26:![0-9]+]]
-; DEBUGLOC-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i8> [[VEC_IND]], <4 x i8> [[BROADCAST_SPLAT]], !dbg [[DBG27:![0-9]+]]
-; DEBUGLOC-NEXT:    [[TMP3]] = mul <4 x i8> [[VEC_PHI]], [[TMP2]], !dbg [[DBG28:![0-9]+]]
-; DEBUGLOC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; DEBUGLOC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], <i8 4, i8 4, i8 4, i8 4>, !dbg [[DBG25]]
-; DEBUGLOC-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; DEBUGLOC-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
-; DEBUGLOC:       middle.block:
-; DEBUGLOC-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> [[TMP3]]), !dbg [[DBG31:![0-9]+]]
-; DEBUGLOC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 16, 16, !dbg [[DBG31]]
-; DEBUGLOC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH1]], !dbg [[DBG31]]
-; DEBUGLOC:       scalar.ph1:
-; DEBUGLOC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ undef, [[MIDDLE_BLOCK]] ], [ undef, [[SCALAR_PH:%.*]] ], !dbg [[DBG25]]
-; DEBUGLOC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
-; DEBUGLOC-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG24]]
-; DEBUGLOC:       for.body:
-; DEBUGLOC-NEXT:    [[MUL16:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH1]] ], [ [[MUL:%.*]], [[FOR_BODY]] ], !dbg [[DBG32:![0-9]+]]
-; DEBUGLOC-NEXT:    [[C_015:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH1]] ], [ [[CONV8:%.*]], [[FOR_BODY]] ], !dbg [[DBG25]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8 [[MUL16]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG32]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8 [[C_015]], metadata [[META12:![0-9]+]], metadata !DIExpression()), !dbg [[DBG25]]
-; DEBUGLOC-NEXT:    [[CONV2:%.*]] = sext i8 [[C_015]] to i32, !dbg [[DBG33:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i32 [[CONV2]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG33]]
-; DEBUGLOC-NEXT:    [[TOBOOL:%.*]] = icmp ne i8 [[C_015]], 0, !dbg [[DBG26]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i1 [[TOBOOL]], metadata [[META15:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26]]
-; DEBUGLOC-NEXT:    [[DOTSINK:%.*]] = select i1 [[TOBOOL]], i8 [[C_015]], i8 [[TMP0]], !dbg [[DBG27]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8 [[DOTSINK]], metadata [[META16:![0-9]+]], metadata !DIExpression()), !dbg [[DBG27]]
-; DEBUGLOC-NEXT:    [[MUL]] = mul i8 [[MUL16]], [[DOTSINK]], !dbg [[DBG28]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8 [[MUL]], metadata [[META17:![0-9]+]], metadata !DIExpression()), !dbg [[DBG28]]
-; DEBUGLOC-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV2]], 1, !dbg [[DBG34:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i32 [[ADD]], metadata [[META18:![0-9]+]], metadata !DIExpression()), !dbg [[DBG34]]
-; DEBUGLOC-NEXT:    [[CONV8]] = trunc i32 [[ADD]] to i8, !dbg [[DBG35:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8 [[CONV8]], metadata [[META19:![0-9]+]], metadata !DIExpression()), !dbg [[DBG35]]
-; DEBUGLOC-NEXT:    [[SEXT:%.*]] = shl i32 [[ADD]], 24, !dbg [[DBG36:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i32 [[SEXT]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG36]]
-; DEBUGLOC-NEXT:    [[PHITMP14:%.*]] = icmp slt i32 [[SEXT]], 268435456, !dbg [[DBG37:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i1 [[PHITMP14]], metadata [[META21:![0-9]+]], metadata !DIExpression()), !dbg [[DBG37]]
-; DEBUGLOC-NEXT:    br i1 [[PHITMP14]], label [[FOR_BODY]], label [[FOR_END]], !dbg [[DBG31]], !llvm.loop [[LOOP38:![0-9]+]]
-; DEBUGLOC:       for.end:
-; DEBUGLOC-NEXT:    [[MUL_LCSSA:%.*]] = phi i8 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ], !dbg [[DBG28]]
-; DEBUGLOC-NEXT:    store i8 [[MUL_LCSSA]], i8* @b, align 1, !dbg [[DBG40:![0-9]+]]
-; DEBUGLOC-NEXT:    ret void, !dbg [[DBG41:![0-9]+]]
-;
+; DEBUGLOC-LABEL: vector.body:
+; DEBUGLOC:         %vec.ind = phi {{.*}}, !dbg ![[DbgLoc:[0-9]+]]
+; DEBUGLOC:         %vec.ind.next = add {{.*}}, !dbg ![[DbgLoc]]
 
 scalar.ph:
   store i8 0, i8* inttoptr (i64 1 to i8*), align 1
@@ -91,3 +37,4 @@ for.end:                                          ; preds = %for.body
 }
 
 ; Check that the location of the new phi comes from %c.015 = phi i8
+; DEBUGLOC:         ![[DbgLoc]] = !DILocation(line: 5

diff  --git a/llvm/test/Transforms/LoopVectorize/if-conv-crash.ll b/llvm/test/Transforms/LoopVectorize/if-conv-crash.ll
index d314e4189141e..d82779a30e880 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conv-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conv-crash.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -18,8 +17,8 @@ while.body.i.preheader:                           ; preds = %if.else
 
 while.body.i:                                     ; preds = %if.end.i, %while.body.i.preheader
   switch i8 undef, label %if.end.i [
-  i8 39, label %if.then.i
-  i8 92, label %if.then.i
+    i8 39, label %if.then.i
+    i8 92, label %if.then.i
   ]
 
 if.then.i:                                        ; preds = %while.body.i, %while.body.i

diff  --git a/llvm/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll
index bc28355dd09e5..83386a0d688ff 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -15,198 +14,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK-NOT: %6000000 =
 
 define void @_Z3fn4i(i32 %p1) {
-; CHECK-LABEL: @_Z3fn4i(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP88:%.*]] = icmp sgt i32 [[P1:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP88]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.lr.ph:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** @b, align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32*, i32** @a, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32*, i32** @c, align 8
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[_ZL3FN3II_EXIT58:%.*]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[TMP4]], 1
-; CHECK-NEXT:    [[TOBOOL_I_I:%.*]] = icmp eq i32 [[AND_I]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_I_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]]
-; CHECK:       if.then.i:
-; CHECK-NEXT:    [[AND_I_I:%.*]] = lshr i32 [[TMP3]], 2
-; CHECK-NEXT:    [[AND_LOBIT_I_I:%.*]] = and i32 [[AND_I_I]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[AND_LOBIT_I_I]], 1
-; CHECK-NEXT:    [[OR_I_I:%.*]] = or i32 [[TMP5]], [[TMP3]]
-; CHECK-NEXT:    [[CMP_I:%.*]] = icmp sgt i32 [[OR_I_I]], 0
-; CHECK-NEXT:    [[CONV_I:%.*]] = zext i1 [[CMP_I]] to i32
-; CHECK-NEXT:    br label [[IF_END_I]]
-; CHECK:       if.end.i:
-; CHECK-NEXT:    [[TOBOOL_I87:%.*]] = phi i1 [ true, [[IF_THEN_I]] ], [ false, [[FOR_BODY]] ]
-; CHECK-NEXT:    [[P1_ADDR_0_I:%.*]] = phi i32 [ [[CONV_I]], [[IF_THEN_I]] ], [ [[TMP3]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[AND1_I:%.*]] = and i32 [[TMP6]], 7
-; CHECK-NEXT:    [[TOBOOL2_I:%.*]] = icmp eq i32 [[AND1_I]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL2_I]], label [[IF_END7_I:%.*]], label [[IF_THEN3_I:%.*]]
-; CHECK:       if.then3.i:
-; CHECK-NEXT:    [[P1_ADDR_0_LOBIT_I:%.*]] = lshr i32 [[P1_ADDR_0_I]], 31
-; CHECK-NEXT:    [[AND6_I:%.*]] = and i32 [[P1_ADDR_0_I]], 1
-; CHECK-NEXT:    [[OR_I:%.*]] = or i32 [[P1_ADDR_0_LOBIT_I]], [[AND6_I]]
-; CHECK-NEXT:    br label [[IF_END7_I]]
-; CHECK:       if.end7.i:
-; CHECK-NEXT:    [[P1_ADDR_1_I:%.*]] = phi i32 [ [[OR_I]], [[IF_THEN3_I]] ], [ [[P1_ADDR_0_I]], [[IF_END_I]] ]
-; CHECK-NEXT:    br i1 [[TOBOOL_I87]], label [[IF_THEN10_I:%.*]], label [[IF_END13_I:%.*]]
-; CHECK:       if.then10.i:
-; CHECK-NEXT:    [[CMP11_I:%.*]] = icmp sgt i32 [[P1_ADDR_1_I]], 0
-; CHECK-NEXT:    [[CONV12_I:%.*]] = zext i1 [[CMP11_I]] to i32
-; CHECK-NEXT:    br label [[IF_END13_I]]
-; CHECK:       if.end13.i:
-; CHECK-NEXT:    [[P1_ADDR_2_I:%.*]] = phi i32 [ [[CONV12_I]], [[IF_THEN10_I]] ], [ [[P1_ADDR_1_I]], [[IF_END7_I]] ]
-; CHECK-NEXT:    br i1 [[TOBOOL_I_I]], label [[_Z3FN2III_EXIT:%.*]], label [[IF_THEN16_I:%.*]]
-; CHECK:       if.then16.i:
-; CHECK-NEXT:    [[AND17_I:%.*]] = lshr i32 [[P1_ADDR_2_I]], 3
-; CHECK-NEXT:    [[AND17_LOBIT_I:%.*]] = and i32 [[AND17_I]], 1
-; CHECK-NEXT:    br label [[_Z3FN2III_EXIT]]
-; CHECK:       _Z3fn2iii.exit:
-; CHECK-NEXT:    [[P1_ADDR_3_I:%.*]] = phi i32 [ [[AND17_LOBIT_I]], [[IF_THEN16_I]] ], [ [[P1_ADDR_2_I]], [[IF_END13_I]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[SHR_I:%.*]] = ashr i32 [[TMP7]], 1
-; CHECK-NEXT:    [[AND_I18_I:%.*]] = and i32 [[SHR_I]], 1
-; CHECK-NEXT:    [[TOBOOL_I19_I:%.*]] = icmp ne i32 [[AND_I18_I]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_I19_I]], label [[IF_THEN_I20_I:%.*]], label [[IF_END_I_I:%.*]]
-; CHECK:       if.then.i20.i:
-; CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp sgt i32 [[P1_ADDR_3_I]], 0
-; CHECK-NEXT:    [[CONV_I_I:%.*]] = zext i1 [[CMP_I_I]] to i32
-; CHECK-NEXT:    br label [[IF_END_I_I]]
-; CHECK:       if.end.i.i:
-; CHECK-NEXT:    [[P1_ADDR_0_I21_I:%.*]] = phi i32 [ [[CONV_I_I]], [[IF_THEN_I20_I]] ], [ [[P1_ADDR_3_I]], [[_Z3FN2III_EXIT]] ]
-; CHECK-NEXT:    [[AND1_I_I:%.*]] = and i32 [[SHR_I]], 7
-; CHECK-NEXT:    [[TOBOOL2_I_I:%.*]] = icmp eq i32 [[AND1_I_I]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL2_I_I]], label [[IF_END7_I_I:%.*]], label [[IF_THEN3_I_I:%.*]]
-; CHECK:       if.then3.i.i:
-; CHECK-NEXT:    [[P1_ADDR_0_LOBIT_I_I:%.*]] = lshr i32 [[P1_ADDR_0_I21_I]], 31
-; CHECK-NEXT:    [[AND6_I_I:%.*]] = and i32 [[P1_ADDR_0_I21_I]], 1
-; CHECK-NEXT:    [[OR_I22_I:%.*]] = or i32 [[P1_ADDR_0_LOBIT_I_I]], [[AND6_I_I]]
-; CHECK-NEXT:    br label [[IF_END7_I_I]]
-; CHECK:       if.end7.i.i:
-; CHECK-NEXT:    [[P1_ADDR_1_I_I:%.*]] = phi i32 [ [[OR_I22_I]], [[IF_THEN3_I_I]] ], [ [[P1_ADDR_0_I21_I]], [[IF_END_I_I]] ]
-; CHECK-NEXT:    br i1 [[TOBOOL_I19_I]], label [[IF_THEN10_I_I:%.*]], label [[IF_END13_I_I:%.*]]
-; CHECK:       if.then10.i.i:
-; CHECK-NEXT:    [[CMP11_I_I:%.*]] = icmp sgt i32 [[P1_ADDR_1_I_I]], 0
-; CHECK-NEXT:    [[CONV12_I_I:%.*]] = zext i1 [[CMP11_I_I]] to i32
-; CHECK-NEXT:    br label [[IF_END13_I_I]]
-; CHECK:       if.end13.i.i:
-; CHECK-NEXT:    [[P1_ADDR_2_I_I:%.*]] = phi i32 [ [[CONV12_I_I]], [[IF_THEN10_I_I]] ], [ [[P1_ADDR_1_I_I]], [[IF_END7_I_I]] ]
-; CHECK-NEXT:    [[AND14_I_I:%.*]] = and i32 [[SHR_I]], 5
-; CHECK-NEXT:    [[TOBOOL15_I_I:%.*]] = icmp eq i32 [[AND14_I_I]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL15_I_I]], label [[_Z3FN2III_EXIT_I:%.*]], label [[IF_THEN16_I_I:%.*]]
-; CHECK:       if.then16.i.i:
-; CHECK-NEXT:    [[AND17_I_I:%.*]] = lshr i32 [[P1_ADDR_2_I_I]], 3
-; CHECK-NEXT:    [[AND17_LOBIT_I_I:%.*]] = and i32 [[AND17_I_I]], 1
-; CHECK-NEXT:    br label [[_Z3FN2III_EXIT_I]]
-; CHECK:       _Z3fn2iii.exit.i:
-; CHECK-NEXT:    [[P1_ADDR_3_I_I:%.*]] = phi i32 [ [[AND17_LOBIT_I_I]], [[IF_THEN16_I_I]] ], [ [[P1_ADDR_2_I_I]], [[IF_END13_I_I]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[TOBOOL_I11_I:%.*]] = icmp eq i32 [[TMP8]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_I11_I]], label [[_ZL3FN3II_EXIT:%.*]], label [[IF_THEN_I15_I:%.*]]
-; CHECK:       if.then.i15.i:
-; CHECK-NEXT:    [[AND_I12_I:%.*]] = lshr i32 [[P1_ADDR_3_I_I]], 2
-; CHECK-NEXT:    [[AND_LOBIT_I13_I:%.*]] = and i32 [[AND_I12_I]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[AND_LOBIT_I13_I]], 1
-; CHECK-NEXT:    [[OR_I14_I:%.*]] = or i32 [[TMP9]], [[P1_ADDR_3_I_I]]
-; CHECK-NEXT:    br label [[_ZL3FN3II_EXIT]]
-; CHECK:       _ZL3fn3ii.exit:
-; CHECK-NEXT:    [[P1_ADDR_0_I16_I:%.*]] = phi i32 [ [[OR_I14_I]], [[IF_THEN_I15_I]] ], [ [[P1_ADDR_3_I_I]], [[_Z3FN2III_EXIT_I]] ]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[P1_ADDR_0_I16_I]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    br i1 [[TOBOOL_I_I]], label [[_Z3FN1II_EXIT_I26:%.*]], label [[IF_THEN_I_I21:%.*]]
-; CHECK:       if.then.i.i21:
-; CHECK-NEXT:    [[AND_I_I18:%.*]] = lshr i32 [[TMP10]], 2
-; CHECK-NEXT:    [[AND_LOBIT_I_I19:%.*]] = and i32 [[AND_I_I18]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = xor i32 [[AND_LOBIT_I_I19]], 1
-; CHECK-NEXT:    [[OR_I_I20:%.*]] = or i32 [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    br label [[_Z3FN1II_EXIT_I26]]
-; CHECK:       _Z3fn1ii.exit.i26:
-; CHECK-NEXT:    [[P1_ADDR_0_I_I22:%.*]] = phi i32 [ [[OR_I_I20]], [[IF_THEN_I_I21]] ], [ [[TMP10]], [[_ZL3FN3II_EXIT]] ]
-; CHECK-NEXT:    br i1 [[TOBOOL_I87]], label [[IF_THEN_I63:%.*]], label [[IF_END_I67:%.*]]
-; CHECK:       if.then.i63:
-; CHECK-NEXT:    [[CMP_I61:%.*]] = icmp sgt i32 [[P1_ADDR_0_I_I22]], 0
-; CHECK-NEXT:    [[CONV_I62:%.*]] = zext i1 [[CMP_I61]] to i32
-; CHECK-NEXT:    br label [[IF_END_I67]]
-; CHECK:       if.end.i67:
-; CHECK-NEXT:    [[P1_ADDR_0_I64:%.*]] = phi i32 [ [[CONV_I62]], [[IF_THEN_I63]] ], [ [[P1_ADDR_0_I_I22]], [[_Z3FN1II_EXIT_I26]] ]
-; CHECK-NEXT:    br i1 [[TOBOOL2_I]], label [[IF_END7_I73:%.*]], label [[IF_THEN3_I71:%.*]]
-; CHECK:       if.then3.i71:
-; CHECK-NEXT:    [[P1_ADDR_0_LOBIT_I68:%.*]] = lshr i32 [[P1_ADDR_0_I64]], 31
-; CHECK-NEXT:    [[AND6_I69:%.*]] = and i32 [[P1_ADDR_0_I64]], 1
-; CHECK-NEXT:    [[OR_I70:%.*]] = or i32 [[P1_ADDR_0_LOBIT_I68]], [[AND6_I69]]
-; CHECK-NEXT:    br label [[IF_END7_I73]]
-; CHECK:       if.end7.i73:
-; CHECK-NEXT:    [[P1_ADDR_1_I72:%.*]] = phi i32 [ [[OR_I70]], [[IF_THEN3_I71]] ], [ [[P1_ADDR_0_I64]], [[IF_END_I67]] ]
-; CHECK-NEXT:    br i1 [[TOBOOL_I87]], label [[IF_THEN10_I76:%.*]], label [[IF_END13_I80:%.*]]
-; CHECK:       if.then10.i76:
-; CHECK-NEXT:    [[CMP11_I74:%.*]] = icmp sgt i32 [[P1_ADDR_1_I72]], 0
-; CHECK-NEXT:    [[CONV12_I75:%.*]] = zext i1 [[CMP11_I74]] to i32
-; CHECK-NEXT:    br label [[IF_END13_I80]]
-; CHECK:       if.end13.i80:
-; CHECK-NEXT:    [[P1_ADDR_2_I77:%.*]] = phi i32 [ [[CONV12_I75]], [[IF_THEN10_I76]] ], [ [[P1_ADDR_1_I72]], [[IF_END7_I73]] ]
-; CHECK-NEXT:    br i1 [[TOBOOL_I_I]], label [[_Z3FN2III_EXIT85:%.*]], label [[IF_THEN16_I83:%.*]]
-; CHECK:       if.then16.i83:
-; CHECK-NEXT:    [[AND17_I81:%.*]] = lshr i32 [[P1_ADDR_2_I77]], 3
-; CHECK-NEXT:    [[AND17_LOBIT_I82:%.*]] = and i32 [[AND17_I81]], 1
-; CHECK-NEXT:    br label [[_Z3FN2III_EXIT85]]
-; CHECK:       _Z3fn2iii.exit85:
-; CHECK-NEXT:    [[P1_ADDR_3_I84:%.*]] = phi i32 [ [[AND17_LOBIT_I82]], [[IF_THEN16_I83]] ], [ [[P1_ADDR_2_I77]], [[IF_END13_I80]] ]
-; CHECK-NEXT:    br i1 [[TOBOOL_I19_I]], label [[IF_THEN_I20_I29:%.*]], label [[IF_END_I_I33:%.*]]
-; CHECK:       if.then.i20.i29:
-; CHECK-NEXT:    [[CMP_I_I27:%.*]] = icmp sgt i32 [[P1_ADDR_3_I84]], 0
-; CHECK-NEXT:    [[CONV_I_I28:%.*]] = zext i1 [[CMP_I_I27]] to i32
-; CHECK-NEXT:    br label [[IF_END_I_I33]]
-; CHECK:       if.end.i.i33:
-; CHECK-NEXT:    [[P1_ADDR_0_I21_I30:%.*]] = phi i32 [ [[CONV_I_I28]], [[IF_THEN_I20_I29]] ], [ [[P1_ADDR_3_I84]], [[_Z3FN2III_EXIT85]] ]
-; CHECK-NEXT:    br i1 [[TOBOOL2_I_I]], label [[IF_END7_I_I39:%.*]], label [[IF_THEN3_I_I37:%.*]]
-; CHECK:       if.then3.i.i37:
-; CHECK-NEXT:    [[P1_ADDR_0_LOBIT_I_I34:%.*]] = lshr i32 [[P1_ADDR_0_I21_I30]], 31
-; CHECK-NEXT:    [[AND6_I_I35:%.*]] = and i32 [[P1_ADDR_0_I21_I30]], 1
-; CHECK-NEXT:    [[OR_I22_I36:%.*]] = or i32 [[P1_ADDR_0_LOBIT_I_I34]], [[AND6_I_I35]]
-; CHECK-NEXT:    br label [[IF_END7_I_I39]]
-; CHECK:       if.end7.i.i39:
-; CHECK-NEXT:    [[P1_ADDR_1_I_I38:%.*]] = phi i32 [ [[OR_I22_I36]], [[IF_THEN3_I_I37]] ], [ [[P1_ADDR_0_I21_I30]], [[IF_END_I_I33]] ]
-; CHECK-NEXT:    br i1 [[TOBOOL_I19_I]], label [[IF_THEN10_I_I42:%.*]], label [[IF_END13_I_I46:%.*]]
-; CHECK:       if.then10.i.i42:
-; CHECK-NEXT:    [[CMP11_I_I40:%.*]] = icmp sgt i32 [[P1_ADDR_1_I_I38]], 0
-; CHECK-NEXT:    [[CONV12_I_I41:%.*]] = zext i1 [[CMP11_I_I40]] to i32
-; CHECK-NEXT:    br label [[IF_END13_I_I46]]
-; CHECK:       if.end13.i.i46:
-; CHECK-NEXT:    [[P1_ADDR_2_I_I43:%.*]] = phi i32 [ [[CONV12_I_I41]], [[IF_THEN10_I_I42]] ], [ [[P1_ADDR_1_I_I38]], [[IF_END7_I_I39]] ]
-; CHECK-NEXT:    br i1 [[TOBOOL15_I_I]], label [[_Z3FN2III_EXIT_I52:%.*]], label [[IF_THEN16_I_I49:%.*]]
-; CHECK:       if.then16.i.i49:
-; CHECK-NEXT:    [[AND17_I_I47:%.*]] = lshr i32 [[P1_ADDR_2_I_I43]], 3
-; CHECK-NEXT:    [[AND17_LOBIT_I_I48:%.*]] = and i32 [[AND17_I_I47]], 1
-; CHECK-NEXT:    br label [[_Z3FN2III_EXIT_I52]]
-; CHECK:       _Z3fn2iii.exit.i52:
-; CHECK-NEXT:    [[P1_ADDR_3_I_I50:%.*]] = phi i32 [ [[AND17_LOBIT_I_I48]], [[IF_THEN16_I_I49]] ], [ [[P1_ADDR_2_I_I43]], [[IF_END13_I_I46]] ]
-; CHECK-NEXT:    br i1 [[TOBOOL_I11_I]], label [[_ZL3FN3II_EXIT58]], label [[IF_THEN_I15_I56:%.*]]
-; CHECK:       if.then.i15.i56:
-; CHECK-NEXT:    [[AND_I12_I53:%.*]] = lshr i32 [[P1_ADDR_3_I_I50]], 2
-; CHECK-NEXT:    [[AND_LOBIT_I13_I54:%.*]] = and i32 [[AND_I12_I53]], 1
-; CHECK-NEXT:    [[TMP12:%.*]] = xor i32 [[AND_LOBIT_I13_I54]], 1
-; CHECK-NEXT:    [[OR_I14_I55:%.*]] = or i32 [[TMP12]], [[P1_ADDR_3_I_I50]]
-; CHECK-NEXT:    br label [[_ZL3FN3II_EXIT58]]
-; CHECK:       _ZL3fn3ii.exit58:
-; CHECK-NEXT:    [[P1_ADDR_0_I16_I57:%.*]] = phi i32 [ [[OR_I14_I55]], [[IF_THEN_I15_I56]] ], [ [[P1_ADDR_3_I_I50]], [[_Z3FN2III_EXIT_I52]] ]
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[P1_ADDR_0_I16_I57]], i32* [[ARRAYIDX7]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], [[P1]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]]
-; CHECK:       for.cond.for.end_crit_edge:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   %cmp88 = icmp sgt i32 %p1, 0
   br i1 %cmp88, label %for.body.lr.ph, label %for.end

diff  --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
index 67f739cda6721..ad2e5d5644e04 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -48,7 +48,7 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
 ; CHECK-NEXT:    store <4 x i32> [[PREDPHI7]], <4 x i32>* [[TMP17]], align 4, !alias.scope !0, !noalias !3
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -76,7 +76,7 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:

diff  --git a/llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll b/llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll
index 94169483df557..f8045edcfb7dc 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -6,340 +5,9 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; Test no-predication of instructions that are provably safe, e.g. dividing by
 ; a non-zero constant.
 define void @test(i32* nocapture %asd, i32* nocapture %aud,
-; CHECK-LABEL: @test(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8*
-; CHECK-NEXT:    [[AUD3:%.*]] = bitcast i32* [[AUD:%.*]] to i8*
-; CHECK-NEXT:    [[ASR6:%.*]] = bitcast i32* [[ASR:%.*]] to i8*
-; CHECK-NEXT:    [[AUR9:%.*]] = bitcast i32* [[AUR:%.*]] to i8*
-; CHECK-NEXT:    [[ASD012:%.*]] = bitcast i32* [[ASD0:%.*]] to i8*
-; CHECK-NEXT:    [[AUD015:%.*]] = bitcast i32* [[AUD0:%.*]] to i8*
-; CHECK-NEXT:    [[ASR018:%.*]] = bitcast i32* [[ASR0:%.*]] to i8*
-; CHECK-NEXT:    [[AUR021:%.*]] = bitcast i32* [[AUR0:%.*]] to i8*
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[AUD]], i64 128
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[ASR]], i64 128
-; CHECK-NEXT:    [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8*
-; CHECK-NEXT:    [[SCEVGEP10:%.*]] = getelementptr i32, i32* [[AUR]], i64 128
-; CHECK-NEXT:    [[SCEVGEP1011:%.*]] = bitcast i32* [[SCEVGEP10]] to i8*
-; CHECK-NEXT:    [[SCEVGEP13:%.*]] = getelementptr i32, i32* [[ASD0]], i64 128
-; CHECK-NEXT:    [[SCEVGEP1314:%.*]] = bitcast i32* [[SCEVGEP13]] to i8*
-; CHECK-NEXT:    [[SCEVGEP16:%.*]] = getelementptr i32, i32* [[AUD0]], i64 128
-; CHECK-NEXT:    [[SCEVGEP1617:%.*]] = bitcast i32* [[SCEVGEP16]] to i8*
-; CHECK-NEXT:    [[SCEVGEP19:%.*]] = getelementptr i32, i32* [[ASR0]], i64 128
-; CHECK-NEXT:    [[SCEVGEP1920:%.*]] = bitcast i32* [[SCEVGEP19]] to i8*
-; CHECK-NEXT:    [[SCEVGEP22:%.*]] = getelementptr i32, i32* [[AUR0]], i64 128
-; CHECK-NEXT:    [[SCEVGEP2223:%.*]] = bitcast i32* [[SCEVGEP22]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    [[BOUND024:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[BOUND125:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
-; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT26]]
-; CHECK-NEXT:    [[BOUND027:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP1011]]
-; CHECK-NEXT:    [[BOUND128:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT29:%.*]] = and i1 [[BOUND027]], [[BOUND128]]
-; CHECK-NEXT:    [[CONFLICT_RDX30:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT29]]
-; CHECK-NEXT:    [[BOUND031:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP1314]]
-; CHECK-NEXT:    [[BOUND132:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT33:%.*]] = and i1 [[BOUND031]], [[BOUND132]]
-; CHECK-NEXT:    [[CONFLICT_RDX34:%.*]] = or i1 [[CONFLICT_RDX30]], [[FOUND_CONFLICT33]]
-; CHECK-NEXT:    [[BOUND035:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP1617]]
-; CHECK-NEXT:    [[BOUND136:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT37:%.*]] = and i1 [[BOUND035]], [[BOUND136]]
-; CHECK-NEXT:    [[CONFLICT_RDX38:%.*]] = or i1 [[CONFLICT_RDX34]], [[FOUND_CONFLICT37]]
-; CHECK-NEXT:    [[BOUND039:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP1920]]
-; CHECK-NEXT:    [[BOUND140:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT41:%.*]] = and i1 [[BOUND039]], [[BOUND140]]
-; CHECK-NEXT:    [[CONFLICT_RDX42:%.*]] = or i1 [[CONFLICT_RDX38]], [[FOUND_CONFLICT41]]
-; CHECK-NEXT:    [[BOUND043:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP2223]]
-; CHECK-NEXT:    [[BOUND144:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT45:%.*]] = and i1 [[BOUND043]], [[BOUND144]]
-; CHECK-NEXT:    [[CONFLICT_RDX46:%.*]] = or i1 [[CONFLICT_RDX42]], [[FOUND_CONFLICT45]]
-; CHECK-NEXT:    [[BOUND047:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[BOUND148:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[FOUND_CONFLICT49:%.*]] = and i1 [[BOUND047]], [[BOUND148]]
-; CHECK-NEXT:    [[CONFLICT_RDX50:%.*]] = or i1 [[CONFLICT_RDX46]], [[FOUND_CONFLICT49]]
-; CHECK-NEXT:    [[BOUND051:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP1011]]
-; CHECK-NEXT:    [[BOUND152:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[FOUND_CONFLICT53:%.*]] = and i1 [[BOUND051]], [[BOUND152]]
-; CHECK-NEXT:    [[CONFLICT_RDX54:%.*]] = or i1 [[CONFLICT_RDX50]], [[FOUND_CONFLICT53]]
-; CHECK-NEXT:    [[BOUND055:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP1314]]
-; CHECK-NEXT:    [[BOUND156:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[FOUND_CONFLICT57:%.*]] = and i1 [[BOUND055]], [[BOUND156]]
-; CHECK-NEXT:    [[CONFLICT_RDX58:%.*]] = or i1 [[CONFLICT_RDX54]], [[FOUND_CONFLICT57]]
-; CHECK-NEXT:    [[BOUND059:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP1617]]
-; CHECK-NEXT:    [[BOUND160:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[FOUND_CONFLICT61:%.*]] = and i1 [[BOUND059]], [[BOUND160]]
-; CHECK-NEXT:    [[CONFLICT_RDX62:%.*]] = or i1 [[CONFLICT_RDX58]], [[FOUND_CONFLICT61]]
-; CHECK-NEXT:    [[BOUND063:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP1920]]
-; CHECK-NEXT:    [[BOUND164:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[FOUND_CONFLICT65:%.*]] = and i1 [[BOUND063]], [[BOUND164]]
-; CHECK-NEXT:    [[CONFLICT_RDX66:%.*]] = or i1 [[CONFLICT_RDX62]], [[FOUND_CONFLICT65]]
-; CHECK-NEXT:    [[BOUND067:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP2223]]
-; CHECK-NEXT:    [[BOUND168:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[FOUND_CONFLICT69:%.*]] = and i1 [[BOUND067]], [[BOUND168]]
-; CHECK-NEXT:    [[CONFLICT_RDX70:%.*]] = or i1 [[CONFLICT_RDX66]], [[FOUND_CONFLICT69]]
-; CHECK-NEXT:    [[BOUND071:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP1011]]
-; CHECK-NEXT:    [[BOUND172:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[FOUND_CONFLICT73:%.*]] = and i1 [[BOUND071]], [[BOUND172]]
-; CHECK-NEXT:    [[CONFLICT_RDX74:%.*]] = or i1 [[CONFLICT_RDX70]], [[FOUND_CONFLICT73]]
-; CHECK-NEXT:    [[BOUND075:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP1314]]
-; CHECK-NEXT:    [[BOUND176:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[FOUND_CONFLICT77:%.*]] = and i1 [[BOUND075]], [[BOUND176]]
-; CHECK-NEXT:    [[CONFLICT_RDX78:%.*]] = or i1 [[CONFLICT_RDX74]], [[FOUND_CONFLICT77]]
-; CHECK-NEXT:    [[BOUND079:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP1617]]
-; CHECK-NEXT:    [[BOUND180:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[FOUND_CONFLICT81:%.*]] = and i1 [[BOUND079]], [[BOUND180]]
-; CHECK-NEXT:    [[CONFLICT_RDX82:%.*]] = or i1 [[CONFLICT_RDX78]], [[FOUND_CONFLICT81]]
-; CHECK-NEXT:    [[BOUND083:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP1920]]
-; CHECK-NEXT:    [[BOUND184:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[FOUND_CONFLICT85:%.*]] = and i1 [[BOUND083]], [[BOUND184]]
-; CHECK-NEXT:    [[CONFLICT_RDX86:%.*]] = or i1 [[CONFLICT_RDX82]], [[FOUND_CONFLICT85]]
-; CHECK-NEXT:    [[BOUND087:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP2223]]
-; CHECK-NEXT:    [[BOUND188:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[FOUND_CONFLICT89:%.*]] = and i1 [[BOUND087]], [[BOUND188]]
-; CHECK-NEXT:    [[CONFLICT_RDX90:%.*]] = or i1 [[CONFLICT_RDX86]], [[FOUND_CONFLICT89]]
-; CHECK-NEXT:    [[BOUND091:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP1314]]
-; CHECK-NEXT:    [[BOUND192:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP1011]]
-; CHECK-NEXT:    [[FOUND_CONFLICT93:%.*]] = and i1 [[BOUND091]], [[BOUND192]]
-; CHECK-NEXT:    [[CONFLICT_RDX94:%.*]] = or i1 [[CONFLICT_RDX90]], [[FOUND_CONFLICT93]]
-; CHECK-NEXT:    [[BOUND095:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP1617]]
-; CHECK-NEXT:    [[BOUND196:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP1011]]
-; CHECK-NEXT:    [[FOUND_CONFLICT97:%.*]] = and i1 [[BOUND095]], [[BOUND196]]
-; CHECK-NEXT:    [[CONFLICT_RDX98:%.*]] = or i1 [[CONFLICT_RDX94]], [[FOUND_CONFLICT97]]
-; CHECK-NEXT:    [[BOUND099:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP1920]]
-; CHECK-NEXT:    [[BOUND1100:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP1011]]
-; CHECK-NEXT:    [[FOUND_CONFLICT101:%.*]] = and i1 [[BOUND099]], [[BOUND1100]]
-; CHECK-NEXT:    [[CONFLICT_RDX102:%.*]] = or i1 [[CONFLICT_RDX98]], [[FOUND_CONFLICT101]]
-; CHECK-NEXT:    [[BOUND0103:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP2223]]
-; CHECK-NEXT:    [[BOUND1104:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP1011]]
-; CHECK-NEXT:    [[FOUND_CONFLICT105:%.*]] = and i1 [[BOUND0103]], [[BOUND1104]]
-; CHECK-NEXT:    [[CONFLICT_RDX106:%.*]] = or i1 [[CONFLICT_RDX102]], [[FOUND_CONFLICT105]]
-; CHECK-NEXT:    [[BOUND0107:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP1617]]
-; CHECK-NEXT:    [[BOUND1108:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP1314]]
-; CHECK-NEXT:    [[FOUND_CONFLICT109:%.*]] = and i1 [[BOUND0107]], [[BOUND1108]]
-; CHECK-NEXT:    [[CONFLICT_RDX110:%.*]] = or i1 [[CONFLICT_RDX106]], [[FOUND_CONFLICT109]]
-; CHECK-NEXT:    [[BOUND0111:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP1920]]
-; CHECK-NEXT:    [[BOUND1112:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP1314]]
-; CHECK-NEXT:    [[FOUND_CONFLICT113:%.*]] = and i1 [[BOUND0111]], [[BOUND1112]]
-; CHECK-NEXT:    [[CONFLICT_RDX114:%.*]] = or i1 [[CONFLICT_RDX110]], [[FOUND_CONFLICT113]]
-; CHECK-NEXT:    [[BOUND0115:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP2223]]
-; CHECK-NEXT:    [[BOUND1116:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP1314]]
-; CHECK-NEXT:    [[FOUND_CONFLICT117:%.*]] = and i1 [[BOUND0115]], [[BOUND1116]]
-; CHECK-NEXT:    [[CONFLICT_RDX118:%.*]] = or i1 [[CONFLICT_RDX114]], [[FOUND_CONFLICT117]]
-; CHECK-NEXT:    [[BOUND0119:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP1920]]
-; CHECK-NEXT:    [[BOUND1120:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP1617]]
-; CHECK-NEXT:    [[FOUND_CONFLICT121:%.*]] = and i1 [[BOUND0119]], [[BOUND1120]]
-; CHECK-NEXT:    [[CONFLICT_RDX122:%.*]] = or i1 [[CONFLICT_RDX118]], [[FOUND_CONFLICT121]]
-; CHECK-NEXT:    [[BOUND0123:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP2223]]
-; CHECK-NEXT:    [[BOUND1124:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP1617]]
-; CHECK-NEXT:    [[FOUND_CONFLICT125:%.*]] = and i1 [[BOUND0123]], [[BOUND1124]]
-; CHECK-NEXT:    [[CONFLICT_RDX126:%.*]] = or i1 [[CONFLICT_RDX122]], [[FOUND_CONFLICT125]]
-; CHECK-NEXT:    [[BOUND0127:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP2223]]
-; CHECK-NEXT:    [[BOUND1128:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP1920]]
-; CHECK-NEXT:    [[FOUND_CONFLICT129:%.*]] = and i1 [[BOUND0127]], [[BOUND1128]]
-; CHECK-NEXT:    [[CONFLICT_RDX130:%.*]] = or i1 [[CONFLICT_RDX126]], [[FOUND_CONFLICT129]]
-; CHECK-NEXT:    br i1 [[CONFLICT_RDX130]], label [[SCALAR_PH:%.*]], label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE139:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD131:%.*]] = load <2 x i32>, <2 x i32>* [[TMP8]], align 4, !alias.scope !11, !noalias !12
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD132:%.*]] = load <2 x i32>, <2 x i32>* [[TMP10]], align 4, !alias.scope !13, !noalias !14
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD133:%.*]] = load <2 x i32>, <2 x i32>* [[TMP12]], align 4, !alias.scope !15, !noalias !16
-; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 23, i32 23>
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <2 x i32> [[WIDE_LOAD131]], <i32 24, i32 24>
-; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <2 x i32> [[WIDE_LOAD132]], <i32 25, i32 25>
-; CHECK-NEXT:    [[TMP16:%.*]] = add nsw <2 x i32> [[WIDE_LOAD133]], <i32 26, i32 26>
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[ASD0]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[AUD0]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[ASR0]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[AUR0]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD134:%.*]] = load <2 x i32>, <2 x i32>* [[TMP22]], align 4, !alias.scope !17, !noalias !18
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD135:%.*]] = load <2 x i32>, <2 x i32>* [[TMP24]], align 4, !alias.scope !19, !noalias !20
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD136:%.*]] = load <2 x i32>, <2 x i32>* [[TMP26]], align 4, !alias.scope !21, !noalias !22
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i32* [[TMP27]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD137:%.*]] = load <2 x i32>, <2 x i32>* [[TMP28]], align 4, !alias.scope !22
-; CHECK-NEXT:    [[TMP29:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 27, i32 27>
-; CHECK-NEXT:    [[TMP30:%.*]] = add nsw <2 x i32> [[WIDE_LOAD131]], <i32 28, i32 28>
-; CHECK-NEXT:    [[TMP31:%.*]] = add nsw <2 x i32> [[WIDE_LOAD132]], <i32 29, i32 29>
-; CHECK-NEXT:    [[TMP32:%.*]] = add nsw <2 x i32> [[WIDE_LOAD133]], <i32 30, i32 30>
-; CHECK-NEXT:    [[TMP33:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
-; CHECK-NEXT:    [[TMP34:%.*]] = sdiv <2 x i32> [[TMP13]], <i32 11, i32 11>
-; CHECK-NEXT:    [[TMP35:%.*]] = udiv <2 x i32> [[TMP14]], <i32 13, i32 13>
-; CHECK-NEXT:    [[TMP36:%.*]] = srem <2 x i32> [[TMP15]], <i32 17, i32 17>
-; CHECK-NEXT:    [[TMP37:%.*]] = urem <2 x i32> [[TMP16]], <i32 19, i32 19>
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <2 x i1> [[TMP33]], i32 0
-; CHECK-NEXT:    br i1 [[TMP38]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]]
-; CHECK:       pred.urem.if:
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <2 x i32> [[TMP29]], i32 0
-; CHECK-NEXT:    [[TMP40:%.*]] = sdiv i32 [[TMP39]], 0
-; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <2 x i32> poison, i32 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[TMP30]], i32 0
-; CHECK-NEXT:    [[TMP43:%.*]] = udiv i32 [[TMP42]], 0
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[TMP43]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <2 x i32> [[TMP31]], i32 0
-; CHECK-NEXT:    [[TMP46:%.*]] = srem i32 [[TMP45]], 0
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <2 x i32> poison, i32 [[TMP46]], i32 0
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP49:%.*]] = urem i32 [[TMP48]], 0
-; CHECK-NEXT:    [[TMP50:%.*]] = insertelement <2 x i32> poison, i32 [[TMP49]], i32 0
-; CHECK-NEXT:    br label [[PRED_UREM_CONTINUE]]
-; CHECK:       pred.urem.continue:
-; CHECK-NEXT:    [[TMP51:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP41]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP52:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP44]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP53:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP47]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP54:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP50]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <2 x i1> [[TMP33]], i32 1
-; CHECK-NEXT:    br i1 [[TMP55]], label [[PRED_UREM_IF138:%.*]], label [[PRED_UREM_CONTINUE139]]
-; CHECK:       pred.urem.if138:
-; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <2 x i32> [[TMP29]], i32 1
-; CHECK-NEXT:    [[TMP57:%.*]] = sdiv i32 [[TMP56]], 0
-; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <2 x i32> [[TMP51]], i32 [[TMP57]], i32 1
-; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <2 x i32> [[TMP30]], i32 1
-; CHECK-NEXT:    [[TMP60:%.*]] = udiv i32 [[TMP59]], 0
-; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <2 x i32> [[TMP52]], i32 [[TMP60]], i32 1
-; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <2 x i32> [[TMP31]], i32 1
-; CHECK-NEXT:    [[TMP63:%.*]] = srem i32 [[TMP62]], 0
-; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i32> [[TMP53]], i32 [[TMP63]], i32 1
-; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <2 x i32> [[TMP32]], i32 1
-; CHECK-NEXT:    [[TMP66:%.*]] = urem i32 [[TMP65]], 0
-; CHECK-NEXT:    [[TMP67:%.*]] = insertelement <2 x i32> [[TMP54]], i32 [[TMP66]], i32 1
-; CHECK-NEXT:    br label [[PRED_UREM_CONTINUE139]]
-; CHECK:       pred.urem.continue139:
-; CHECK-NEXT:    [[TMP68:%.*]] = phi <2 x i32> [ [[TMP51]], [[PRED_UREM_CONTINUE]] ], [ [[TMP58]], [[PRED_UREM_IF138]] ]
-; CHECK-NEXT:    [[TMP69:%.*]] = phi <2 x i32> [ [[TMP52]], [[PRED_UREM_CONTINUE]] ], [ [[TMP61]], [[PRED_UREM_IF138]] ]
-; CHECK-NEXT:    [[TMP70:%.*]] = phi <2 x i32> [ [[TMP53]], [[PRED_UREM_CONTINUE]] ], [ [[TMP64]], [[PRED_UREM_IF138]] ]
-; CHECK-NEXT:    [[TMP71:%.*]] = phi <2 x i32> [ [[TMP54]], [[PRED_UREM_CONTINUE]] ], [ [[TMP67]], [[PRED_UREM_IF138]] ]
-; CHECK-NEXT:    [[TMP72:%.*]] = xor <2 x i1> [[TMP33]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP13]], <2 x i32> [[TMP34]]
-; CHECK-NEXT:    [[PREDPHI140:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP14]], <2 x i32> [[TMP35]]
-; CHECK-NEXT:    [[PREDPHI141:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP15]], <2 x i32> [[TMP36]]
-; CHECK-NEXT:    [[PREDPHI142:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP16]], <2 x i32> [[TMP37]]
-; CHECK-NEXT:    [[PREDPHI143:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP29]], <2 x i32> [[TMP68]]
-; CHECK-NEXT:    [[PREDPHI144:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP30]], <2 x i32> [[TMP69]]
-; CHECK-NEXT:    [[PREDPHI145:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP31]], <2 x i32> [[TMP70]]
-; CHECK-NEXT:    [[PREDPHI146:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP32]], <2 x i32> [[TMP71]]
-; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], <2 x i32>* [[TMP74]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI140]], <2 x i32>* [[TMP76]], align 4, !alias.scope !11, !noalias !12
-; CHECK-NEXT:    [[TMP77:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP78:%.*]] = bitcast i32* [[TMP77]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI141]], <2 x i32>* [[TMP78]], align 4, !alias.scope !13, !noalias !14
-; CHECK-NEXT:    [[TMP79:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP80:%.*]] = bitcast i32* [[TMP79]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI142]], <2 x i32>* [[TMP80]], align 4, !alias.scope !15, !noalias !16
-; CHECK-NEXT:    [[TMP81:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP82:%.*]] = bitcast i32* [[TMP81]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI143]], <2 x i32>* [[TMP82]], align 4, !alias.scope !17, !noalias !18
-; CHECK-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-NEXT:    [[TMP84:%.*]] = bitcast i32* [[TMP83]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI144]], <2 x i32>* [[TMP84]], align 4, !alias.scope !19, !noalias !20
-; CHECK-NEXT:    [[TMP85:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP86:%.*]] = bitcast i32* [[TMP85]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI145]], <2 x i32>* [[TMP86]], align 4, !alias.scope !21, !noalias !22
-; CHECK-NEXT:    [[TMP87:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-NEXT:    [[TMP88:%.*]] = bitcast i32* [[TMP87]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI146]], <2 x i32>* [[TMP88]], align 4, !alias.scope !22
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP89:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT:    br i1 [[TMP89]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 128, 128
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[IUD:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[ISR:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[IUR:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[LSD:%.*]] = load i32, i32* [[ISD]], align 4
-; CHECK-NEXT:    [[LUD:%.*]] = load i32, i32* [[IUD]], align 4
-; CHECK-NEXT:    [[LSR:%.*]] = load i32, i32* [[ISR]], align 4
-; CHECK-NEXT:    [[LUR:%.*]] = load i32, i32* [[IUR]], align 4
-; CHECK-NEXT:    [[PSD:%.*]] = add nsw i32 [[LSD]], 23
-; CHECK-NEXT:    [[PUD:%.*]] = add nsw i32 [[LUD]], 24
-; CHECK-NEXT:    [[PSR:%.*]] = add nsw i32 [[LSR]], 25
-; CHECK-NEXT:    [[PUR:%.*]] = add nsw i32 [[LUR]], 26
-; CHECK-NEXT:    [[ISD0:%.*]] = getelementptr inbounds i32, i32* [[ASD0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[IUD0:%.*]] = getelementptr inbounds i32, i32* [[AUD0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[ISR0:%.*]] = getelementptr inbounds i32, i32* [[ASR0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[IUR0:%.*]] = getelementptr inbounds i32, i32* [[AUR0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[LSD0:%.*]] = load i32, i32* [[ISD0]], align 4
-; CHECK-NEXT:    [[LUD0:%.*]] = load i32, i32* [[IUD0]], align 4
-; CHECK-NEXT:    [[LSR0:%.*]] = load i32, i32* [[ISR0]], align 4
-; CHECK-NEXT:    [[LUR0:%.*]] = load i32, i32* [[IUR0]], align 4
-; CHECK-NEXT:    [[PSD0:%.*]] = add nsw i32 [[LSD]], 27
-; CHECK-NEXT:    [[PUD0:%.*]] = add nsw i32 [[LUD]], 28
-; CHECK-NEXT:    [[PSR0:%.*]] = add nsw i32 [[LSR]], 29
-; CHECK-NEXT:    [[PUR0:%.*]] = add nsw i32 [[LUR]], 30
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[LSD]], 100
-; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_END]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[RSD:%.*]] = sdiv i32 [[PSD]], 11
-; CHECK-NEXT:    [[RUD:%.*]] = udiv i32 [[PUD]], 13
-; CHECK-NEXT:    [[RSR:%.*]] = srem i32 [[PSR]], 17
-; CHECK-NEXT:    [[RUR:%.*]] = urem i32 [[PUR]], 19
-; CHECK-NEXT:    [[RSD0:%.*]] = sdiv i32 [[PSD0]], 0
-; CHECK-NEXT:    [[RUD0:%.*]] = udiv i32 [[PUD0]], 0
-; CHECK-NEXT:    [[RSR0:%.*]] = srem i32 [[PSR0]], 0
-; CHECK-NEXT:    [[RUR0:%.*]] = urem i32 [[PUR0]], 0
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[YSD_0:%.*]] = phi i32 [ [[RSD]], [[IF_THEN]] ], [ [[PSD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[YUD_0:%.*]] = phi i32 [ [[RUD]], [[IF_THEN]] ], [ [[PUD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[YSR_0:%.*]] = phi i32 [ [[RSR]], [[IF_THEN]] ], [ [[PSR]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[YUR_0:%.*]] = phi i32 [ [[RUR]], [[IF_THEN]] ], [ [[PUR]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[YSD0_0:%.*]] = phi i32 [ [[RSD0]], [[IF_THEN]] ], [ [[PSD0]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[YUD0_0:%.*]] = phi i32 [ [[RUD0]], [[IF_THEN]] ], [ [[PUD0]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[YSR0_0:%.*]] = phi i32 [ [[RSR0]], [[IF_THEN]] ], [ [[PSR0]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[YUR0_0:%.*]] = phi i32 [ [[RUR0]], [[IF_THEN]] ], [ [[PUR0]], [[FOR_BODY]] ]
-; CHECK-NEXT:    store i32 [[YSD_0]], i32* [[ISD]], align 4
-; CHECK-NEXT:    store i32 [[YUD_0]], i32* [[IUD]], align 4
-; CHECK-NEXT:    store i32 [[YSR_0]], i32* [[ISR]], align 4
-; CHECK-NEXT:    store i32 [[YUR_0]], i32* [[IUR]], align 4
-; CHECK-NEXT:    store i32 [[YSD0_0]], i32* [[ISD0]], align 4
-; CHECK-NEXT:    store i32 [[YUD0_0]], i32* [[IUD0]], align 4
-; CHECK-NEXT:    store i32 [[YSR0_0]], i32* [[ISR0]], align 4
-; CHECK-NEXT:    store i32 [[YUR0_0]], i32* [[IUR0]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-;
-  i32* nocapture %asr, i32* nocapture %aur,
-  i32* nocapture %asd0, i32* nocapture %aud0,
-  i32* nocapture %asr0, i32* nocapture %aur0
+                  i32* nocapture %asr, i32* nocapture %aur,
+                  i32* nocapture %asd0, i32* nocapture %aud0,
+                  i32* nocapture %asr0, i32* nocapture %aur0
 ) {
 entry:
   br label %for.body
@@ -347,6 +15,16 @@ entry:
 for.cond.cleanup:                                 ; preds = %if.end
   ret void
 
+; CHECK-LABEL: test
+; CHECK: vector.body:
+; CHECK: %{{.*}} = sdiv <2 x i32> %{{.*}}, <i32 11, i32 11>
+; CHECK: %{{.*}} = udiv <2 x i32> %{{.*}}, <i32 13, i32 13>
+; CHECK: %{{.*}} = srem <2 x i32> %{{.*}}, <i32 17, i32 17>
+; CHECK: %{{.*}} = urem <2 x i32> %{{.*}}, <i32 19, i32 19>
+; CHECK-NOT: %{{.*}} = sdiv <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = udiv <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = srem <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = urem <2 x i32> %{{.*}}, <i32 0, i32 0>
 
 for.body:                                         ; preds = %if.end, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]

diff  --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index 012ef251f4c1d..c736c05535046 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -212,7 +212,7 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v.
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
 ; UNROLL-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE4]] ]
-; UNROLL-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; UNROLL-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ]
 ; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]]
 ; UNROLL-NEXT:    br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE4]]
 ; UNROLL:       pred.store.if:
@@ -220,18 +220,18 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v.
 ; UNROLL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[INDUCTION]]
 ; UNROLL-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
 ; UNROLL-NEXT:    store i32 [[TMP6]], i32* [[TMP5]], align 4
-; UNROLL-NEXT:    [[INDUCTION2:%.*]] = add i64 [[OFFSET_IDX]], 1
-; UNROLL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION2]]
+; UNROLL-NEXT:    [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 1
+; UNROLL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION1]]
 ; UNROLL-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
 ; UNROLL-NEXT:    store i32 [[TMP8]], i32* [[TMP7]], align 4
 ; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE4]]
 ; UNROLL:       pred.store.continue4:
 ; UNROLL-NEXT:    [[TMP9:%.*]] = add i32 [[VEC_PHI]], 1
-; UNROLL-NEXT:    [[TMP10:%.*]] = add i32 [[VEC_PHI1]], 1
+; UNROLL-NEXT:    [[TMP10:%.*]] = add i32 [[VEC_PHI2]], 1
 ; UNROLL-NEXT:    [[TMP11:%.*]] = xor i1 [[COND_2]], true
 ; UNROLL-NEXT:    [[TMP12:%.*]] = xor i1 [[COND_2]], true
 ; UNROLL-NEXT:    [[PREDPHI]] = select i1 [[TMP11]], i32 [[VEC_PHI]], i32 [[TMP9]]
-; UNROLL-NEXT:    [[PREDPHI5]] = select i1 [[TMP12]], i32 [[VEC_PHI1]], i32 [[TMP10]]
+; UNROLL-NEXT:    [[PREDPHI5]] = select i1 [[TMP12]], i32 [[VEC_PHI2]], i32 [[TMP10]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -284,7 +284,7 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v.
 ; UNROLL-NOSIMPLIFY:       vector.body:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE4]] ]
-; UNROLL-NOSIMPLIFY-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; UNROLL-NOSIMPLIFY-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]]
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if:
@@ -296,18 +296,18 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v.
 ; UNROLL-NOSIMPLIFY:       pred.store.continue:
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if3:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION2:%.*]] = add i64 [[OFFSET_IDX]], 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION2]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION1]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP7]], i32* [[TMP6]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE4]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue4:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = add i32 [[VEC_PHI]], 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = add i32 [[VEC_PHI1]], 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = add i32 [[VEC_PHI2]], 1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP10:%.*]] = xor i1 [[COND_2]], true
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP11:%.*]] = xor i1 [[COND_2]], true
 ; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI]] = select i1 [[TMP10]], i32 [[VEC_PHI]], i32 [[TMP8]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI5]] = select i1 [[TMP11]], i32 [[VEC_PHI1]], i32 [[TMP9]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI5]] = select i1 [[TMP11]], i32 [[VEC_PHI2]], i32 [[TMP9]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -460,22 +460,22 @@ define void @minimal_bit_widths(i1 %c) {
 ; UNROLL-NEXT:  entry:
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
-; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
-; UNROLL-NEXT:    br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; UNROLL-NEXT:    br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE6]]
 ; UNROLL:       pred.store.if:
-; UNROLL-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]]
+; UNROLL-NEXT:    [[INDUCTION3:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION3]]
 ; UNROLL-NEXT:    [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1
 ; UNROLL-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
 ; UNROLL-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
 ; UNROLL-NEXT:    store i8 [[TMP3]], i8* [[TMP0]], align 1
-; UNROLL-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]]
+; UNROLL-NEXT:    [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION4]]
 ; UNROLL-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
 ; UNROLL-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; UNROLL-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
 ; UNROLL-NEXT:    store i8 [[TMP7]], i8* [[TMP4]], align 1
-; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; UNROLL:       pred.store.continue4:
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
@@ -508,18 +508,18 @@ define void @minimal_bit_widths(i1 %c) {
 ; UNROLL-NOSIMPLIFY:       vector.ph:
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NOSIMPLIFY:       vector.body:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION3:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION3]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
 ; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP3]], i8* [[TMP0]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue:
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if3:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]]
@@ -527,7 +527,7 @@ define void @minimal_bit_widths(i1 %c) {
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
 ; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP7]], i8* [[TMP4]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue4:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
@@ -566,34 +566,34 @@ define void @minimal_bit_widths(i1 %c) {
 ; VEC:       vector.body:
 ; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
 ; VEC-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VEC-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
-; VEC-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* [[TMP1]], i32 0
-; VEC-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <2 x i8>*
-; VEC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP3]], align 1
-; VEC-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
-; VEC-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; VEC-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
+; VEC-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i32 0
+; VEC-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <2 x i8>*
+; VEC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP4]], align 1
+; VEC-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
+; VEC-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; VEC:       pred.store.if:
-; VEC-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0
-; VEC-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
-; VEC-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
-; VEC-NEXT:    [[TMP8:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
-; VEC-NEXT:    store i8 [[TMP7]], i8* [[TMP8]], align 1
+; VEC-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0
+; VEC-NEXT:    [[TMP7:%.*]] = zext i8 [[TMP6]] to i32
+; VEC-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
+; VEC-NEXT:    [[TMP9:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
+; VEC-NEXT:    store i8 [[TMP8]], i8* [[TMP9]], align 1
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; VEC:       pred.store.continue:
-; VEC-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
-; VEC-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
+; VEC-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
+; VEC-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; VEC:       pred.store.if2:
-; VEC-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 1
-; VEC-NEXT:    [[TMP11:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1
-; VEC-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP11]] to i32
-; VEC-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8
-; VEC-NEXT:    [[TMP14:%.*]] = getelementptr i8, i8* undef, i64 [[TMP10]]
-; VEC-NEXT:    store i8 [[TMP13]], i8* [[TMP14]], align 1
+; VEC-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 1
+; VEC-NEXT:    [[TMP12:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1
+; VEC-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP12]] to i32
+; VEC-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
+; VEC-NEXT:    [[TMP15:%.*]] = getelementptr i8, i8* undef, i64 [[TMP11]]
+; VEC-NEXT:    store i8 [[TMP14]], i8* [[TMP15]], align 1
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE3]]
 ; VEC:       pred.store.continue3:
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VEC-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; VEC-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VEC-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; VEC-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; VEC:       middle.block:
 ; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 undef, undef
 ; VEC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
@@ -672,7 +672,7 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, i8* %ptr) {
 ; UNROLL-NOSIMPLIFY:       vector.ph:
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NOSIMPLIFY:       vector.body:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[INDUCTION]]
@@ -687,13 +687,13 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, i8* %ptr) {
 ; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP4]], i8* [[TMP0]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue:
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if3:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP1]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
 ; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP7]], i8* [[TMP1]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue4:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0

diff --git a/llvm/test/Transforms/LoopVectorize/induction-ptrcasts.ll b/llvm/test/Transforms/LoopVectorize/induction-ptrcasts.ll
index 22849227c9541..6a02a371eafb1 100644
--- a/llvm/test/Transforms/LoopVectorize/induction-ptrcasts.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction-ptrcasts.ll
@@ -6,96 +6,32 @@
 
 define void @int_iv_based_on_pointer_iv(i8* %A) {
 ; VF1-LABEL: @int_iv_based_on_pointer_iv(
-; VF1-NEXT:  entry:
-; VF1-NEXT:    [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 add (i64 ptrtoint (i32* @f to i64), i64 -4), i64 0)
-; VF1-NEXT:    [[TMP0:%.*]] = sub i64 add (i64 ptrtoint (i32* @f to i64), i64 -1), [[SMIN]]
-; VF1-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
-; VF1-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; VF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2
-; VF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; VF1:       vector.ph:
-; VF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2
-; VF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; VF1-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
-; VF1-NEXT:    [[IND_END2:%.*]] = getelementptr i32, i32* null, i64 [[N_VEC]]
-; VF1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VF1:       vector.body:
-; VF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; VF1-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
 ; VF1-NEXT:    [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF1-NEXT:    [[INDUCTION3:%.*]] = add i64 [[OFFSET_IDX]], 4
-; VF1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDUCTION]]
-; VF1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDUCTION3]]
-; VF1-NEXT:    store i8 0, i8* [[TMP3]], align 1
-; VF1-NEXT:    store i8 0, i8* [[TMP4]], align 1
+; VF1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDUCTION]]
+; VF1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDUCTION3]]
+; VF1-NEXT:    store i8 0, i8* [[TMP7]], align 1
+; VF1-NEXT:    store i8 0, i8* [[TMP8]], align 1
 ; VF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF1-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; VF1:       middle.block:
-; VF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; VF1-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; VF1:       scalar.ph:
-; VF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ null, [[ENTRY]] ]
-; VF1-NEXT:    br label [[LOOP:%.*]]
-; VF1:       loop:
-; VF1-NEXT:    [[IV_INT:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INT_NEXT:%.*]], [[LOOP]] ]
-; VF1-NEXT:    [[IV_PTR:%.*]] = phi i32* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ]
-; VF1-NEXT:    [[IV_PTR_NEXT]] = getelementptr inbounds i32, i32* [[IV_PTR]], i64 1
-; VF1-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV_INT]]
-; VF1-NEXT:    store i8 0, i8* [[GEP_A]], align 1
-; VF1-NEXT:    [[IV_INT_NEXT]] = ptrtoint i32* [[IV_PTR_NEXT]] to i64
-; VF1-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 ptrtoint (i32* @f to i64), [[IV_INT_NEXT]]
-; VF1-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[SUB_PTR_SUB]], 0
-; VF1-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; VF1:       exit:
-; VF1-NEXT:    ret void
+; VF1-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]],
+; VF1-NEXT:    br i1 [[TMP13]], label %middle.block, label %vector.body
 ;
 ; VF2-LABEL: @int_iv_based_on_pointer_iv(
-; VF2-NEXT:  entry:
-; VF2-NEXT:    [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 add (i64 ptrtoint (i32* @f to i64), i64 -4), i64 0)
-; VF2-NEXT:    [[TMP0:%.*]] = sub i64 add (i64 ptrtoint (i32* @f to i64), i64 -1), [[SMIN]]
-; VF2-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
-; VF2-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; VF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2
-; VF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; VF2:       vector.ph:
-; VF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2
-; VF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; VF2-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
-; VF2-NEXT:    [[IND_END2:%.*]] = getelementptr i32, i32* null, i64 [[N_VEC]]
-; VF2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VF2:       vector.body:
-; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
 ; VF2-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF2-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 4
-; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP3]]
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP4]]
-; VF2-NEXT:    store i8 0, i8* [[TMP5]], align 1
-; VF2-NEXT:    store i8 0, i8* [[TMP6]], align 1
+; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP4]]
+; VF2-NEXT:    store i8 0, i8* [[TMP9]], align 1
+; VF2-NEXT:    store i8 0, i8* [[TMP10]], align 1
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF2-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; VF2:       middle.block:
-; VF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; VF2-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ null, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV_INT:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INT_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV_PTR:%.*]] = phi i32* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV_PTR_NEXT]] = getelementptr inbounds i32, i32* [[IV_PTR]], i64 1
-; VF2-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV_INT]]
-; VF2-NEXT:    store i8 0, i8* [[GEP_A]], align 1
-; VF2-NEXT:    [[IV_INT_NEXT]] = ptrtoint i32* [[IV_PTR_NEXT]] to i64
-; VF2-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 ptrtoint (i32* @f to i64), [[IV_INT_NEXT]]
-; VF2-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[SUB_PTR_SUB]], 0
-; VF2-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
+; VF2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]],
+; VF2-NEXT:    br i1 [[TMP14]], label %middle.block, label %vector.body
 ;
 entry:
   br label %loop

diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 982ddb36f4710..4bfb94fc87fbd 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -2166,12 +2166,12 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL:       vector.ph:
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483644
 ; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
+; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
-; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE10:%.*]] ]
-; UNROLL-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_UDIV_CONTINUE10]] ]
-; UNROLL-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_UDIV_CONTINUE10]] ]
+; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE11:%.*]] ]
+; UNROLL-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_UDIV_CONTINUE11]] ]
+; UNROLL-NEXT:    [[VEC_PHI2:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_UDIV_CONTINUE11]] ]
 ; UNROLL-NEXT:    [[TMP0:%.*]] = or i32 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP1:%.*]] = sext i32 [[INDEX]] to i64
 ; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
@@ -2179,7 +2179,7 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
 ; UNROLL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 2
 ; UNROLL-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; UNROLL-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4
+; UNROLL-NEXT:    [[WIDE_LOAD3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4
 ; UNROLL-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; UNROLL:       pred.udiv.if:
 ; UNROLL-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0
@@ -2188,40 +2188,40 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; UNROLL:       pred.udiv.continue:
 ; UNROLL-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ]
-; UNROLL-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]]
+; UNROLL-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
 ; UNROLL:       pred.udiv.if3:
 ; UNROLL-NEXT:    [[TMP10:%.*]] = or i32 [[INDEX]], 1
 ; UNROLL-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 1
 ; UNROLL-NEXT:    [[TMP12:%.*]] = udiv i32 [[TMP11]], [[TMP10]]
 ; UNROLL-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i64 1
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
 ; UNROLL:       pred.udiv.continue4:
-; UNROLL-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF3]] ]
-; UNROLL-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; UNROLL-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF4]] ]
+; UNROLL-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
 ; UNROLL:       pred.udiv.if7:
-; UNROLL-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 0
+; UNROLL-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD3]], i64 0
 ; UNROLL-NEXT:    [[TMP16:%.*]] = udiv i32 [[TMP15]], [[TMP0]]
 ; UNROLL-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i64 0
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
 ; UNROLL:       pred.udiv.continue8:
-; UNROLL-NEXT:    [[TMP18:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP17]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NEXT:    [[TMP18:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE5]] ], [ [[TMP17]], [[PRED_UDIV_IF8]] ]
+; UNROLL-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11]]
 ; UNROLL:       pred.udiv.if9:
 ; UNROLL-NEXT:    [[TMP19:%.*]] = or i32 [[INDEX]], 3
-; UNROLL-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 1
+; UNROLL-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD3]], i64 1
 ; UNROLL-NEXT:    [[TMP21:%.*]] = udiv i32 [[TMP20]], [[TMP19]]
 ; UNROLL-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[TMP21]], i64 1
-; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE11]]
 ; UNROLL:       pred.udiv.continue10:
-; UNROLL-NEXT:    [[TMP23:%.*]] = phi <2 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP22]], [[PRED_UDIV_IF9]] ]
+; UNROLL-NEXT:    [[TMP23:%.*]] = phi <2 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE9]] ], [ [[TMP22]], [[PRED_UDIV_IF10]] ]
 ; UNROLL-NEXT:    [[TMP24:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
 ; UNROLL-NEXT:    [[TMP25:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT:    [[TMP26:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT5]], <i1 true, i1 poison>
+; UNROLL-NEXT:    [[TMP26:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT6]], <i1 true, i1 poison>
 ; UNROLL-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i1> [[TMP26]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP25]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP14]]
-; UNROLL-NEXT:    [[PREDPHI11:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP23]]
+; UNROLL-NEXT:    [[PREDPHI12:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD3]], <2 x i32> [[TMP23]]
 ; UNROLL-NEXT:    [[TMP28]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
-; UNROLL-NEXT:    [[TMP29]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]]
+; UNROLL-NEXT:    [[TMP29]] = add <2 x i32> [[PREDPHI12]], [[VEC_PHI2]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
@@ -2264,13 +2264,13 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i32 0
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT5]], <2 x i1> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i32 0
+; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT6]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE10:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP32:%.*]], [[PRED_UDIV_CONTINUE10]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[PRED_UDIV_CONTINUE10]] ]
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE11:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP32:%.*]], [[PRED_UDIV_CONTINUE11]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI2:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[PRED_UDIV_CONTINUE11]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
@@ -2280,7 +2280,7 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 2
 ; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP8]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if:
@@ -2291,40 +2291,40 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC:       pred.udiv.continue:
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP13]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP13]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if3:
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = add i32 [[INDEX]], 1
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = udiv i32 [[TMP15]], [[TMP14]]
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP16]], i32 1
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
 ; UNROLL-NO-IC:       pred.udiv.continue4:
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = phi <2 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_UDIV_IF3]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 0
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP19]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = phi <2 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_UDIV_IF4]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT7]], i32 0
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP19]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if7:
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD3]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = udiv i32 [[TMP20]], [[TMP1]]
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
 ; UNROLL-NO-IC:       pred.udiv.continue8:
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP22]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE5]] ], [ [[TMP22]], [[PRED_UDIV_IF8]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT7]], i32 1
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11]]
 ; UNROLL-NO-IC:       pred.udiv.if9:
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = add i32 [[INDEX]], 3
-; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = extractelement <2 x i32> [[WIDE_LOAD3]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = udiv i32 [[TMP26]], [[TMP25]]
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP27]], i32 1
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE11]]
 ; UNROLL-NO-IC:       pred.udiv.continue10:
-; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = phi <2 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP28]], [[PRED_UDIV_IF9]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = phi <2 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE9]] ], [ [[TMP28]], [[PRED_UDIV_IF10]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
-; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT6]], <i1 true, i1 true>
+; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT7]], <i1 true, i1 true>
 ; UNROLL-NO-IC-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP30]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP18]]
-; UNROLL-NO-IC-NEXT:    [[PREDPHI11:%.*]] = select <2 x i1> [[TMP31]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP29]]
+; UNROLL-NO-IC-NEXT:    [[PREDPHI12:%.*]] = select <2 x i1> [[TMP31]], <2 x i32> [[WIDE_LOAD3]], <2 x i32> [[TMP29]]
 ; UNROLL-NO-IC-NEXT:    [[TMP32]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
-; UNROLL-NO-IC-NEXT:    [[TMP33]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]]
+; UNROLL-NO-IC-NEXT:    [[TMP33]] = add <2 x i32> [[PREDPHI12]], [[VEC_PHI2]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
@@ -2364,12 +2364,12 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE:       vector.ph:
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483640
 ; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0
-; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
+; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
-; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE18:%.*]] ]
-; INTERLEAVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_UDIV_CONTINUE18]] ]
-; INTERLEAVE-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[PRED_UDIV_CONTINUE18]] ]
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE19:%.*]] ]
+; INTERLEAVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_UDIV_CONTINUE19]] ]
+; INTERLEAVE-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[PRED_UDIV_CONTINUE19]] ]
 ; INTERLEAVE-NEXT:    [[TMP0:%.*]] = or i32 [[INDEX]], 4
 ; INTERLEAVE-NEXT:    [[TMP1:%.*]] = sext i32 [[INDEX]] to i64
 ; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
@@ -2377,7 +2377,7 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
 ; INTERLEAVE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4
 ; INTERLEAVE-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; INTERLEAVE-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
+; INTERLEAVE-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
 ; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; INTERLEAVE:       pred.udiv.if:
 ; INTERLEAVE-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0
@@ -2386,76 +2386,76 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; INTERLEAVE:       pred.udiv.continue:
 ; INTERLEAVE-NEXT:    [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ]
-; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]]
+; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
 ; INTERLEAVE:       pred.udiv.if3:
 ; INTERLEAVE-NEXT:    [[TMP10:%.*]] = or i32 [[INDEX]], 1
 ; INTERLEAVE-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1
 ; INTERLEAVE-NEXT:    [[TMP12:%.*]] = udiv i32 [[TMP11]], [[TMP10]]
 ; INTERLEAVE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP12]], i64 1
-; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
+; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
 ; INTERLEAVE:       pred.udiv.continue4:
-; INTERLEAVE-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF3]] ]
-; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]]
+; INTERLEAVE-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF4]] ]
+; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
 ; INTERLEAVE:       pred.udiv.if5:
 ; INTERLEAVE-NEXT:    [[TMP15:%.*]] = or i32 [[INDEX]], 2
 ; INTERLEAVE-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2
 ; INTERLEAVE-NEXT:    [[TMP17:%.*]] = udiv i32 [[TMP16]], [[TMP15]]
 ; INTERLEAVE-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP17]], i64 2
-; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE6]]
+; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE7]]
 ; INTERLEAVE:       pred.udiv.continue6:
-; INTERLEAVE-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP18]], [[PRED_UDIV_IF5]] ]
-; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; INTERLEAVE-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP18]], [[PRED_UDIV_IF6]] ]
+; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
 ; INTERLEAVE:       pred.udiv.if7:
 ; INTERLEAVE-NEXT:    [[TMP20:%.*]] = or i32 [[INDEX]], 3
 ; INTERLEAVE-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3
 ; INTERLEAVE-NEXT:    [[TMP22:%.*]] = udiv i32 [[TMP21]], [[TMP20]]
 ; INTERLEAVE-NEXT:    [[TMP23:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP22]], i64 3
-; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
+; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
 ; INTERLEAVE:       pred.udiv.continue8:
-; INTERLEAVE-NEXT:    [[TMP24:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP23]], [[PRED_UDIV_IF7]] ]
-; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
+; INTERLEAVE-NEXT:    [[TMP24:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP23]], [[PRED_UDIV_IF8]] ]
+; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
 ; INTERLEAVE:       pred.udiv.if11:
-; INTERLEAVE-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 0
+; INTERLEAVE-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i64 0
 ; INTERLEAVE-NEXT:    [[TMP26:%.*]] = udiv i32 [[TMP25]], [[TMP0]]
 ; INTERLEAVE-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i64 0
-; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE12]]
+; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE13]]
 ; INTERLEAVE:       pred.udiv.continue12:
-; INTERLEAVE-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE8]] ], [ [[TMP27]], [[PRED_UDIV_IF11]] ]
-; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]]
+; INTERLEAVE-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE9]] ], [ [[TMP27]], [[PRED_UDIV_IF12]] ]
+; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
 ; INTERLEAVE:       pred.udiv.if13:
 ; INTERLEAVE-NEXT:    [[TMP29:%.*]] = or i32 [[INDEX]], 5
-; INTERLEAVE-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 1
+; INTERLEAVE-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i64 1
 ; INTERLEAVE-NEXT:    [[TMP31:%.*]] = udiv i32 [[TMP30]], [[TMP29]]
 ; INTERLEAVE-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i64 1
-; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE14]]
+; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE15]]
 ; INTERLEAVE:       pred.udiv.continue14:
-; INTERLEAVE-NEXT:    [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP32]], [[PRED_UDIV_IF13]] ]
-; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]]
+; INTERLEAVE-NEXT:    [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP32]], [[PRED_UDIV_IF14]] ]
+; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
 ; INTERLEAVE:       pred.udiv.if15:
 ; INTERLEAVE-NEXT:    [[TMP34:%.*]] = or i32 [[INDEX]], 6
-; INTERLEAVE-NEXT:    [[TMP35:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 2
+; INTERLEAVE-NEXT:    [[TMP35:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i64 2
 ; INTERLEAVE-NEXT:    [[TMP36:%.*]] = udiv i32 [[TMP35]], [[TMP34]]
 ; INTERLEAVE-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i64 2
-; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE16]]
+; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE17]]
 ; INTERLEAVE:       pred.udiv.continue16:
-; INTERLEAVE-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP37]], [[PRED_UDIV_IF15]] ]
-; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]]
+; INTERLEAVE-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP37]], [[PRED_UDIV_IF16]] ]
+; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF18:%.*]], label [[PRED_UDIV_CONTINUE19]]
 ; INTERLEAVE:       pred.udiv.if17:
 ; INTERLEAVE-NEXT:    [[TMP39:%.*]] = or i32 [[INDEX]], 7
-; INTERLEAVE-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 3
+; INTERLEAVE-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i64 3
 ; INTERLEAVE-NEXT:    [[TMP41:%.*]] = udiv i32 [[TMP40]], [[TMP39]]
 ; INTERLEAVE-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i64 3
-; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE18]]
+; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE19]]
 ; INTERLEAVE:       pred.udiv.continue18:
-; INTERLEAVE-NEXT:    [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP42]], [[PRED_UDIV_IF17]] ]
+; INTERLEAVE-NEXT:    [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE17]] ], [ [[TMP42]], [[PRED_UDIV_IF18]] ]
 ; INTERLEAVE-NEXT:    [[TMP44:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison, i1 poison, i1 poison>
 ; INTERLEAVE-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i1> [[TMP44]], <4 x i1> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT:    [[TMP46:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT9]], <i1 true, i1 poison, i1 poison, i1 poison>
+; INTERLEAVE-NEXT:    [[TMP46:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT10]], <i1 true, i1 poison, i1 poison, i1 poison>
 ; INTERLEAVE-NEXT:    [[TMP47:%.*]] = shufflevector <4 x i1> [[TMP46]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP45]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP24]]
-; INTERLEAVE-NEXT:    [[PREDPHI19:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[TMP43]]
+; INTERLEAVE-NEXT:    [[PREDPHI20:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD3]], <4 x i32> [[TMP43]]
 ; INTERLEAVE-NEXT:    [[TMP48]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]]
-; INTERLEAVE-NEXT:    [[TMP49]] = add <4 x i32> [[PREDPHI19]], [[VEC_PHI1]]
+; INTERLEAVE-NEXT:    [[TMP49]] = add <4 x i32> [[PREDPHI20]], [[VEC_PHI2]]
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; INTERLEAVE-NEXT:    [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
@@ -2539,10 +2539,10 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[TMP3]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP4]], i32 1
@@ -2551,7 +2551,7 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1
 ; CHECK-NEXT:    store i16 [[TMP10]], i16* [[TMP8]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i32> [[VEC_IND1]], <i32 2, i32 2>
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       middle.block:
@@ -2588,9 +2588,9 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; IND-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IND-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
-; IND-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; IND-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND1]]
 ; IND-NEXT:    [[TMP5:%.*]] = trunc <2 x i32> [[TMP4]] to <2 x i16>
 ; IND-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[INDEX]], i32 1
 ; IND-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP3]], i32 1
@@ -2599,7 +2599,7 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; IND-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1
 ; IND-NEXT:    store i16 [[TMP9]], i16* [[TMP7]], align 2
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; IND-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i32> [[VEC_IND1]], <i32 2, i32 2>
 ; IND-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; IND:       middle.block:
@@ -2633,18 +2633,18 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
 ; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0
 ; UNROLL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
+; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
 ; UNROLL-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP5:%.*]] = or i64 [[INDEX]], 3
-; UNROLL-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; UNROLL-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLATINSERT2]], <i32 2, i32 poison>
+; UNROLL-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND2]]
+; UNROLL-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLATINSERT6]], <i32 2, i32 poison>
 ; UNROLL-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[VEC_IND]]
+; UNROLL-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[VEC_IND2]]
 ; UNROLL-NEXT:    [[TMP10:%.*]] = trunc <2 x i32> [[TMP6]] to <2 x i16>
 ; UNROLL-NEXT:    [[TMP11:%.*]] = trunc <2 x i32> [[TMP9]] to <2 x i16>
 ; UNROLL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[INDEX]], i32 1
@@ -2660,7 +2660,7 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; UNROLL-NEXT:    [[TMP19:%.*]] = extractelement <2 x i16> [[TMP11]], i64 1
 ; UNROLL-NEXT:    store i16 [[TMP19]], i16* [[TMP15]], align 2
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
+; UNROLL-NEXT:    [[VEC_IND_NEXT5]] = add <2 x i32> [[VEC_IND2]], <i32 4, i32 4>
 ; UNROLL-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; UNROLL:       middle.block:
@@ -2695,19 +2695,19 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
+; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT6]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; UNROLL-NO-IC-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[STEP_ADD3:%.*]] = add <2 x i32> [[VEC_IND2]], <i32 2, i32 2>
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 1
 ; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 3
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT3]], [[STEP_ADD]]
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND2]]
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT7]], [[STEP_ADD3]]
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16>
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = trunc <2 x i32> [[TMP8]] to <2 x i16>
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[TMP3]], i32 1
@@ -2723,7 +2723,7 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1
 ; UNROLL-NO-IC-NEXT:    store i16 [[TMP18]], i16* [[TMP14]], align 2
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
+; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT5]] = add <2 x i32> [[STEP_ADD3]], <i32 2, i32 2>
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
@@ -2757,11 +2757,11 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934584
 ; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0
 ; INTERLEAVE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
 ; INTERLEAVE-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 2
 ; INTERLEAVE-NEXT:    [[TMP5:%.*]] = or i64 [[INDEX]], 3
@@ -2769,10 +2769,10 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; INTERLEAVE-NEXT:    [[TMP7:%.*]] = or i64 [[INDEX]], 5
 ; INTERLEAVE-NEXT:    [[TMP8:%.*]] = or i64 [[INDEX]], 6
 ; INTERLEAVE-NEXT:    [[TMP9:%.*]] = or i64 [[INDEX]], 7
-; INTERLEAVE-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; INTERLEAVE-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[BROADCAST_SPLATINSERT2]], <i32 4, i32 poison, i32 poison, i32 poison>
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND2]]
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[BROADCAST_SPLATINSERT6]], <i32 4, i32 poison, i32 poison, i32 poison>
 ; INTERLEAVE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[VEC_IND]]
+; INTERLEAVE-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[VEC_IND2]]
 ; INTERLEAVE-NEXT:    [[TMP14:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i16>
 ; INTERLEAVE-NEXT:    [[TMP15:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i16>
 ; INTERLEAVE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[INDEX]], i32 1
@@ -2800,7 +2800,7 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; INTERLEAVE-NEXT:    [[TMP31:%.*]] = extractelement <4 x i16> [[TMP15]], i64 3
 ; INTERLEAVE-NEXT:    store i16 [[TMP31]], i16* [[TMP23]], align 2
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
+; INTERLEAVE-NEXT:    [[VEC_IND_NEXT5]] = add <4 x i32> [[VEC_IND2]], <i32 8, i32 8, i32 8, i32 8>
 ; INTERLEAVE-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; INTERLEAVE:       middle.block:
@@ -3555,22 +3555,22 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[CAST_CRD]]
 ; CHECK-NEXT:    [[IND_END3:%.*]] = add i32 [[EXT]], [[N_VEC]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; CHECK-NEXT:    [[DOTSPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION6:%.*]] = add <2 x i32> [[DOTSPLAT5]], <i32 0, i32 1>
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND7:%.*]] = phi <2 x i32> [ [[INDUCTION6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT8:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i8 [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP16]], align 4
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND7]], <2 x i32>* [[TMP16]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; CHECK-NEXT:    [[VEC_IND_NEXT8]] = add <2 x i32> [[VEC_IND7]], <i32 2, i32 2>
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; CHECK:       middle.block:
@@ -3624,21 +3624,21 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; IND-NEXT:    [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8
 ; IND-NEXT:    [[IND_END:%.*]] = add i8 [[CAST_CRD]], [[T]]
 ; IND-NEXT:    [[IND_END3:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; IND-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0
-; IND-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; IND-NEXT:    [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; IND-NEXT:    [[DOTSPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0
+; IND-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; IND-NEXT:    [[INDUCTION6:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT5]], <i32 0, i32 1>
 ; IND-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IND-NEXT:    [[VEC_IND7:%.*]] = phi <2 x i32> [ [[INDUCTION6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT8:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8
 ; IND-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[TMP12]], [[T]]
 ; IND-NEXT:    [[TMP13:%.*]] = sext i8 [[OFFSET_IDX]] to i64
 ; IND-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP13]]
 ; IND-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; IND-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP15]], align 4
+; IND-NEXT:    store <2 x i32> [[VEC_IND7]], <2 x i32>* [[TMP15]], align 4
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; IND-NEXT:    [[VEC_IND_NEXT8]] = add <2 x i32> [[VEC_IND7]], <i32 2, i32 2>
 ; IND-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; IND:       middle.block:
@@ -3693,25 +3693,25 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NEXT:    [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8
 ; UNROLL-NEXT:    [[IND_END:%.*]] = add i8 [[CAST_CRD]], [[T]]
 ; UNROLL-NEXT:    [[IND_END3:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; UNROLL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0
-; UNROLL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT:    [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; UNROLL-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0
+; UNROLL-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT5]], <2 x i32> poison, <2 x i32> zeroinitializer
+; UNROLL-NEXT:    [[INDUCTION7:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT6]], <i32 0, i32 1>
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; UNROLL-NEXT:    [[VEC_IND8:%.*]] = phi <2 x i32> [ [[INDUCTION7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NEXT:    [[STEP_ADD9:%.*]] = add <2 x i32> [[VEC_IND8]], <i32 2, i32 2>
 ; UNROLL-NEXT:    [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8
 ; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[TMP12]], [[T]]
 ; UNROLL-NEXT:    [[TMP13:%.*]] = sext i8 [[OFFSET_IDX]] to i64
 ; UNROLL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP13]]
 ; UNROLL-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP15]], align 4
+; UNROLL-NEXT:    store <2 x i32> [[VEC_IND8]], <2 x i32>* [[TMP15]], align 4
 ; UNROLL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i64 2
 ; UNROLL-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP17]], align 4
+; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD9]], <2 x i32>* [[TMP17]], align 4
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
+; UNROLL-NEXT:    [[VEC_IND_NEXT11]] = add <2 x i32> [[VEC_IND8]], <i32 4, i32 4>
 ; UNROLL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; UNROLL:       middle.block:
@@ -3768,14 +3768,14 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC-NEXT:    [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8
 ; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[CAST_CRD]]
 ; UNROLL-NO-IC-NEXT:    [[IND_END3:%.*]] = add i32 [[EXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 0
-; UNROLL-NO-IC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 0
+; UNROLL-NO-IC-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT5]], <2 x i32> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[INDUCTION7:%.*]] = add <2 x i32> [[DOTSPLAT6]], <i32 0, i32 1>
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; UNROLL-NO-IC-NEXT:    [[VEC_IND8:%.*]] = phi <2 x i32> [ [[INDUCTION7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[STEP_ADD9:%.*]] = add <2 x i32> [[VEC_IND8]], <i32 2, i32 2>
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP12]]
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0
@@ -3784,12 +3784,12 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[TMP14]]
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP18]], align 4
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND8]], <2 x i32>* [[TMP18]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 2
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP20]], align 4
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD9]], <2 x i32>* [[TMP20]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
+; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT11]] = add <2 x i32> [[STEP_ADD9]], <i32 2, i32 2>
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
@@ -3843,25 +3843,25 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT:    [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8
 ; INTERLEAVE-NEXT:    [[IND_END:%.*]] = add i8 [[CAST_CRD]], [[T]]
 ; INTERLEAVE-NEXT:    [[IND_END3:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; INTERLEAVE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 0
-; INTERLEAVE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT:    [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; INTERLEAVE-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 0
+; INTERLEAVE-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer
+; INTERLEAVE-NEXT:    [[INDUCTION7:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT6]], <i32 0, i32 1, i32 2, i32 3>
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; INTERLEAVE-NEXT:    [[VEC_IND8:%.*]] = phi <4 x i32> [ [[INDUCTION7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[STEP_ADD9:%.*]] = add <4 x i32> [[VEC_IND8]], <i32 4, i32 4, i32 4, i32 4>
 ; INTERLEAVE-NEXT:    [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8
 ; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[TMP12]], [[T]]
 ; INTERLEAVE-NEXT:    [[TMP13:%.*]] = sext i8 [[OFFSET_IDX]] to i64
 ; INTERLEAVE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP13]]
 ; INTERLEAVE-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP15]], align 4
+; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND8]], <4 x i32>* [[TMP15]], align 4
 ; INTERLEAVE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i64 4
 ; INTERLEAVE-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP17]], align 4
+; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD9]], <4 x i32>* [[TMP17]], align 4
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
+; INTERLEAVE-NEXT:    [[VEC_IND_NEXT11]] = add <4 x i32> [[VEC_IND8]], <i32 8, i32 8, i32 8, i32 8>
 ; INTERLEAVE-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; INTERLEAVE:       middle.block:
@@ -3951,22 +3951,22 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[CAST_CRD]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
 ; CHECK-NEXT:    [[IND_END2:%.*]] = add i32 [[EXT_MUL]], [[TMP12]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
+; CHECK-NEXT:    [[DOTSPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION6:%.*]] = add <2 x i32> [[DOTSPLAT5]], <i32 0, i32 4>
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND7:%.*]] = phi <2 x i32> [ [[INDUCTION6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT8:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i8 [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
 ; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP17]], align 4
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND7]], <2 x i32>* [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
+; CHECK-NEXT:    [[VEC_IND_NEXT8]] = add <2 x i32> [[VEC_IND7]], <i32 8, i32 8>
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; CHECK:       middle.block:
@@ -4023,21 +4023,21 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; IND-NEXT:    [[IND_END:%.*]] = add i8 [[CAST_CRD]], [[T]]
 ; IND-NEXT:    [[TMP12:%.*]] = add i32 [[N_VEC]], [[EXT]]
 ; IND-NEXT:    [[IND_END2:%.*]] = shl i32 [[TMP12]], 2
-; IND-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
-; IND-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; IND-NEXT:    [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
+; IND-NEXT:    [[DOTSPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
+; IND-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; IND-NEXT:    [[INDUCTION6:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT5]], <i32 0, i32 4>
 ; IND-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IND-NEXT:    [[VEC_IND7:%.*]] = phi <2 x i32> [ [[INDUCTION6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT8:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
 ; IND-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[TMP13]], [[T]]
 ; IND-NEXT:    [[TMP14:%.*]] = sext i8 [[OFFSET_IDX]] to i64
 ; IND-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP14]]
 ; IND-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; IND-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP16]], align 4
+; IND-NEXT:    store <2 x i32> [[VEC_IND7]], <2 x i32>* [[TMP16]], align 4
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
+; IND-NEXT:    [[VEC_IND_NEXT8]] = add <2 x i32> [[VEC_IND7]], <i32 8, i32 8>
 ; IND-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; IND:       middle.block:
@@ -4095,25 +4095,25 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NEXT:    [[IND_END:%.*]] = add i8 [[CAST_CRD]], [[T]]
 ; UNROLL-NEXT:    [[TMP12:%.*]] = add i32 [[N_VEC]], [[EXT]]
 ; UNROLL-NEXT:    [[IND_END2:%.*]] = shl i32 [[TMP12]], 2
-; UNROLL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
-; UNROLL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT:    [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
+; UNROLL-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
+; UNROLL-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT5]], <2 x i32> poison, <2 x i32> zeroinitializer
+; UNROLL-NEXT:    [[INDUCTION7:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT6]], <i32 0, i32 4>
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
+; UNROLL-NEXT:    [[VEC_IND8:%.*]] = phi <2 x i32> [ [[INDUCTION7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NEXT:    [[STEP_ADD9:%.*]] = add <2 x i32> [[VEC_IND8]], <i32 8, i32 8>
 ; UNROLL-NEXT:    [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
 ; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[TMP13]], [[T]]
 ; UNROLL-NEXT:    [[TMP14:%.*]] = sext i8 [[OFFSET_IDX]] to i64
 ; UNROLL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP14]]
 ; UNROLL-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP16]], align 4
+; UNROLL-NEXT:    store <2 x i32> [[VEC_IND8]], <2 x i32>* [[TMP16]], align 4
 ; UNROLL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i64 2
 ; UNROLL-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP18]], align 4
+; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD9]], <2 x i32>* [[TMP18]], align 4
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 16, i32 16>
+; UNROLL-NEXT:    [[VEC_IND_NEXT11]] = add <2 x i32> [[VEC_IND8]], <i32 16, i32 16>
 ; UNROLL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL:       middle.block:
@@ -4173,14 +4173,14 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[CAST_CRD]]
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
 ; UNROLL-NO-IC-NEXT:    [[IND_END2:%.*]] = add i32 [[EXT_MUL]], [[TMP12]]
-; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 0
-; UNROLL-NO-IC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
+; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 0
+; UNROLL-NO-IC-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT5]], <2 x i32> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[INDUCTION7:%.*]] = add <2 x i32> [[DOTSPLAT6]], <i32 0, i32 4>
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
+; UNROLL-NO-IC-NEXT:    [[VEC_IND8:%.*]] = phi <2 x i32> [ [[INDUCTION7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[STEP_ADD9:%.*]] = add <2 x i32> [[VEC_IND8]], <i32 8, i32 8>
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP13]]
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 0
@@ -4189,12 +4189,12 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[TMP15]]
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP19]], align 4
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND8]], <2 x i32>* [[TMP19]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 2
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP21]], align 4
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD9]], <2 x i32>* [[TMP21]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 8, i32 8>
+; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT11]] = add <2 x i32> [[STEP_ADD9]], <i32 8, i32 8>
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
@@ -4251,25 +4251,25 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT:    [[IND_END:%.*]] = add i8 [[CAST_CRD]], [[T]]
 ; INTERLEAVE-NEXT:    [[TMP12:%.*]] = add i32 [[N_VEC]], [[EXT]]
 ; INTERLEAVE-NEXT:    [[IND_END2:%.*]] = shl i32 [[TMP12]], 2
-; INTERLEAVE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0
-; INTERLEAVE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT:    [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 4, i32 8, i32 12>
+; INTERLEAVE-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0
+; INTERLEAVE-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer
+; INTERLEAVE-NEXT:    [[INDUCTION7:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT6]], <i32 0, i32 4, i32 8, i32 12>
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 16, i32 16, i32 16, i32 16>
+; INTERLEAVE-NEXT:    [[VEC_IND8:%.*]] = phi <4 x i32> [ [[INDUCTION7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[STEP_ADD9:%.*]] = add <4 x i32> [[VEC_IND8]], <i32 16, i32 16, i32 16, i32 16>
 ; INTERLEAVE-NEXT:    [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
 ; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[TMP13]], [[T]]
 ; INTERLEAVE-NEXT:    [[TMP14:%.*]] = sext i8 [[OFFSET_IDX]] to i64
 ; INTERLEAVE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP14]]
 ; INTERLEAVE-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP16]], align 4
+; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND8]], <4 x i32>* [[TMP16]], align 4
 ; INTERLEAVE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i64 4
 ; INTERLEAVE-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP18]], align 4
+; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD9]], <4 x i32>* [[TMP18]], align 4
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 32, i32 32, i32 32, i32 32>
+; INTERLEAVE-NEXT:    [[VEC_IND_NEXT11]] = add <4 x i32> [[VEC_IND8]], <i32 32, i32 32, i32 32, i32 32>
 ; INTERLEAVE-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; INTERLEAVE:       middle.block:
@@ -4343,14 +4343,14 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4454,19 +4454,19 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP5]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP2]]
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP7]], align 4
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP7]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP9]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4558,14 +4558,14 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDEX]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP10]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4692,19 +4692,19 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDEX]] to i32
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 0
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add i32 [[TMP5]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]]
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP7]]
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP11]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add i32 [[TMP5]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP8]]
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP13]], align 4
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP13]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP15]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4805,14 +4805,14 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 [[I]], [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4934,19 +4934,19 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i32 [[I]], [[INDEX]]
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 0
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP2]]
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP3]]
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP8]], align 4
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP8]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP10]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -6131,20 +6131,20 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32*
 ; CHECK-NEXT:    [[VEC_IND]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[DST:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[DST:%.*]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP7]], <2 x i32>* [[TMP9]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, 100
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 1
@@ -6259,31 +6259,31 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32*
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> <i32 1, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = load i32, i32* [[SRC:%.*]], align 4
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> <i32 1, i32 2>
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = load i32, i32* [[SRC]], align 4
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = load i32, i32* [[SRC]], align 4
+; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP3]]
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP4]]
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[DST:%.*]], i32 [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[DST]], i32 [[TMP2]]
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP7]]
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP8]]
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[TMP11]], <2 x i32>* [[TMP14]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP5]]
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP6]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[DST:%.*]], i32 [[TMP1]]
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[DST]], i32 [[TMP3]]
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP9]]
+; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP10]]
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = getelementptr i32, i32* [[TMP11]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[TMP12]], <2 x i32>* [[TMP16]], align 4
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[TMP13]], <2 x i32>* [[TMP16]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <2 x i32>*
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[TMP14]], <2 x i32>* [[TMP18]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, 100
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD]], i32 1

diff  --git a/llvm/test/Transforms/LoopVectorize/induction_plus.ll b/llvm/test/Transforms/LoopVectorize/induction_plus.ll
index b86fdbda36f31..24762ce113147 100644
--- a/llvm/test/Transforms/LoopVectorize/induction_plus.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction_plus.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -7,47 +6,16 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 define void @array_at_plus_one(i32 %n) {
 ; CHECK-LABEL: @array_at_plus_one(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i64 [[TMP3]], 12
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @array, i64 0, i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV_PLUS_12:%.*]] = add nsw i64 [[IV]], 12
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @array, i64 0, i64 [[IV_PLUS_12]]
-; CHECK-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    store i32 [[IV_TRUNC]], i32* [[GEP]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: [[VEC_IV_TRUNC:%.+]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ [[VEC_IV_TRUNC_NEXT:%.+]], %vector.body ]
+; CHECK: [[T1:%.+]] = add i64 %index, 0
+; CHECK: [[T2:%.+]] = add nsw i64 [[T1]], 12
+; CHECK-NEXT: [[GEP:%.+]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @array, i64 0, i64 [[T2]]
+; CHECK-NEXT: [[GEP0:%.+]] = getelementptr inbounds i32, i32* [[GEP]], i32 0
+; CHECK-NEXT: [[BC:%.+]] = bitcast i32* [[GEP0]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[VEC_IV_TRUNC]], <4 x i32>* [[BC]]
+; CHECK: [[VEC_IV_TRUNC_NEXT]] = add <4 x i32> [[VEC_IV_TRUNC]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: ret void
 ;
 entry:
   br label %loop

diff  --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll
index ac592b1ba9cf2..213c30602f095 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s
 
 ; When merging two stores with interleaved access vectorization, make sure we
@@ -12,66 +11,6 @@ target triple = "arm64-apple-ios5.0.0"
 %struct.Vec2r = type { double, double }
 
 define void @foobar(%struct.Vec4r* nocapture readonly %p, i32 %i)
-; CHECK-LABEL: @foobar(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CP:%.*]] = alloca [20 x %struct.Vec2r], align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [20 x %struct.Vec2r]* [[CP]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_VEC4R:%.*]], %struct.Vec4r* [[P:%.*]], i64 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_VEC4R]], %struct.Vec4r* [[P]], i64 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = load double, double* [[TMP3]], align 8, !tbaa [[TBAA3:![0-9]+]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load double, double* [[TMP4]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x double> [[TMP8]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_VEC4R]], %struct.Vec4r* [[P]], i64 [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_VEC4R]], %struct.Vec4r* [[P]], i64 [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP10]], align 8, !tbaa [[TBAA8:![0-9]+]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load double, double* [[TMP11]], align 8, !tbaa [[TBAA8]]
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> poison, double [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x double> [[TMP14]], double [[TMP13]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul <2 x double> [[TMP15]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* [[CP]], i64 0, i64 [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP17]], i32 -1
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP20]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], <4 x double>* [[TMP19]], align 8, !tbaa [[TBAA9:![0-9]+]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* [[CP]], i64 0, i64 0
-; CHECK-NEXT:    call void @g(%struct.Vec2r* nonnull [[ARRAYDECAY]])
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds [[STRUCT_VEC4R]], %struct.Vec4r* [[P]], i64 [[INDVARS_IV]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = load double, double* [[X]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[TMP22]], 2.000000e+00
-; CHECK-NEXT:    [[X4:%.*]] = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* [[CP]], i64 0, i64 [[INDVARS_IV]], i32 0
-; CHECK-NEXT:    store double [[MUL]], double* [[X4]], align 8, !tbaa [[TBAA12:![0-9]+]]
-; CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds [[STRUCT_VEC4R]], %struct.Vec4r* [[P]], i64 [[INDVARS_IV]], i32 1
-; CHECK-NEXT:    [[TMP23:%.*]] = load double, double* [[Y]], align 8, !tbaa [[TBAA8]]
-; CHECK-NEXT:    [[MUL7:%.*]] = fmul double [[TMP23]], 3.000000e+00
-; CHECK-NEXT:    [[Y10:%.*]] = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* [[CP]], i64 0, i64 [[INDVARS_IV]], i32 1
-; CHECK-NEXT:    store double [[MUL7]], double* [[Y10]], align 8, !tbaa [[TBAA14:![0-9]+]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
 {
 entry:
   %cp = alloca [20 x %struct.Vec2r], align 8
@@ -91,6 +30,9 @@ for.body:                                         ; preds = %for.body, %entry
   %x4 = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* %cp, i64 0, i64 %indvars.iv, i32 0
 
 ; The new store should alias any double rather than one of the fields of Vec2r.
+; CHECK: store <4 x double> {{.*}} !tbaa ![[STORE_TBAA:[0-9]+]]
+; CHECK-DAG: ![[DOUBLE_TBAA:[0-9]+]] = !{!"double", !{{[0-9+]}}, i64 0}
+; CHECK-DAG: ![[STORE_TBAA]] = !{![[DOUBLE_TBAA]], ![[DOUBLE_TBAA]], i64 0}
   store double %mul, double* %x4, align 8, !tbaa !8
   %y = getelementptr inbounds %struct.Vec4r, %struct.Vec4r* %p, i64 %indvars.iv, i32 1
   %2 = load double, double* %y, align 8, !tbaa !10

diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
index a4c1574426e34..cee0231001173 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
@@ -1,11 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED
 ; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED
 
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 
-; We test here that the loop-vectorizer forms an interleave-groups from
+; We test here that the loop-vectorizer forms an interleave-groups from 
 ; predicated memory accesses only if they are both in the same (predicated)
 ; block (first scenario below).
 ; If the accesses are not in the same predicated block, an interleave-group
@@ -31,11 +30,11 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 ;}
 
 
-; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided1'
+; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided1' 
 ; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
-; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
 
-; STRIDED_MASKED: LV: Checking a loop in 'masked_strided1'
+; STRIDED_MASKED: LV: Checking a loop in 'masked_strided1' 
 ; STRIDED_MASKED: LV: Analyzing interleaved accesses...
 ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 %{{.*}}, i8* %{{.*}}, align 1
 ; STRIDED_MASKED-NEXT: LV: Inserted:  store i8  %{{.*}}, i8* %{{.*}}, align 1
@@ -64,13 +63,13 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 ; }
 ;}
 
-; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided2'
+; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided2' 
 ; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
 ; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
 ; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
-; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
 
-; STRIDED_MASKED: LV: Checking a loop in 'masked_strided2'
+; STRIDED_MASKED: LV: Checking a loop in 'masked_strided2' 
 ; STRIDED_MASKED: LV: Analyzing interleaved accesses...
 ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
 ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
@@ -98,11 +97,11 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 ;}
 
 
-; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided3'
+; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided3' 
 ; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
-; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
 
-; STRIDED_MASKED: LV: Checking a loop in 'masked_strided3'
+; STRIDED_MASKED: LV: Checking a loop in 'masked_strided3' 
 ; STRIDED_MASKED: LV: Analyzing interleaved accesses...
 ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
 ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1

diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 61598563eb62e..54ccd7f070003 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -115,11 +115,11 @@ define void @test_struct_array_load3_store3() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC5]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC6]], <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDEX]], i32 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -2
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
@@ -354,7 +354,7 @@ define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], %struct.ST2* [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 -6
@@ -362,19 +362,19 @@ define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND]]
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <4 x i32> [[REVERSE4]], [[VEC_IND1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
-; CHECK-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE3]], <4 x i32> [[REVERSE4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE5]], <4 x i32> [[REVERSE6]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 ; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 -4, i32 -4, i32 -4, i32 -4>
+; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 -4, i32 -4, i32 -4, i32 -4>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
@@ -759,12 +759,12 @@ define void @mixed_load3_store3(i32* nocapture %A) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[NEXT_GEP]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC5]], [[VEC_IND]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC6]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 -2
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

diff --git a/llvm/test/Transforms/LoopVectorize/lcssa-crash.ll b/llvm/test/Transforms/LoopVectorize/lcssa-crash.ll
index 9ded4974a0b7b..3d3ef9e05935c 100644
--- a/llvm/test/Transforms/LoopVectorize/lcssa-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/lcssa-crash.ll
@@ -1,5 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 

diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll
index 82b32324d7050..f032b5c277259 100644
--- a/llvm/test/Transforms/LoopVectorize/loop-form.ll
+++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll
@@ -59,28 +59,28 @@ define void @bottom_tested(i16* %p, i32 %n) {
 ; TAILFOLD:       vector.body:
 ; TAILFOLD-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
 ; TAILFOLD-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
-; TAILFOLD-NEXT:    [[TMP1:%.*]] = icmp ule <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; TAILFOLD-NEXT:    [[TMP2:%.*]] = sext <2 x i32> [[VEC_IND]] to <2 x i64>
-; TAILFOLD-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; TAILFOLD-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; TAILFOLD-NEXT:    [[TMP3:%.*]] = icmp ule <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; TAILFOLD-NEXT:    [[TMP4:%.*]] = sext <2 x i32> [[VEC_IND]] to <2 x i64>
+; TAILFOLD-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; TAILFOLD-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; TAILFOLD:       pred.store.if:
-; TAILFOLD-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; TAILFOLD-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[TMP5]], align 4
+; TAILFOLD-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; TAILFOLD-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP6]]
+; TAILFOLD-NEXT:    store i16 0, i16* [[TMP7]], align 4
 ; TAILFOLD-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; TAILFOLD:       pred.store.continue:
-; TAILFOLD-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; TAILFOLD-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; TAILFOLD-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; TAILFOLD-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
 ; TAILFOLD:       pred.store.if1:
-; TAILFOLD-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; TAILFOLD-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[TMP7]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[TMP8]], align 4
+; TAILFOLD-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; TAILFOLD-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[TMP9]]
+; TAILFOLD-NEXT:    store i16 0, i16* [[TMP10]], align 4
 ; TAILFOLD-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; TAILFOLD:       pred.store.continue2:
 ; TAILFOLD-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
 ; TAILFOLD-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; TAILFOLD-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; TAILFOLD-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; TAILFOLD-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; TAILFOLD-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; TAILFOLD:       middle.block:
 ; TAILFOLD-NEXT:    br i1 true, label [[IF_END:%.*]], label [[SCALAR_PH]]
 ; TAILFOLD:       scalar.ph:
@@ -129,14 +129,14 @@ define void @early_exit(i16* %p, i32 %n) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -263,14 +263,14 @@ define void @multiple_unique_exit(i16* %p, i32 %n) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -345,14 +345,14 @@ define i32 @multiple_unique_exit2(i16* %p, i32 %n) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -429,14 +429,14 @@ define i32 @multiple_unique_exit3(i16* %p, i32 %n) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -514,14 +514,14 @@ define i32 @multiple_exit_blocks(i16* %p, i32 %n) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -603,14 +603,14 @@ define i32 @multiple_exit_blocks2(i16* %p, i32 %n) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -697,16 +697,16 @@ define i32 @multiple_exit_blocks3(i16* %p, i32 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <2 x i32> [[VEC_IND]], <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <2 x i32> [[VEC_IND]], <i32 1, i32 1>
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1128,22 +1128,22 @@ define i32 @me_reduction(i32* %addr) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4]] = add <2 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5]] = add <2 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP5]])
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       loop.header:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -1152,8 +1152,8 @@ define i32 @me_reduction(i32* %addr) {
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[GEP]], align 4
-; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP8]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND2_NOT:%.*]] = icmp eq i64 [[IV]], 400
 ; CHECK-NEXT:    br i1 [[EXITCOND2_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP21:![0-9]+]]

diff --git a/llvm/test/Transforms/LoopVectorize/loop-vect-memdep.ll b/llvm/test/Transforms/LoopVectorize/loop-vect-memdep.ll
index 08ac6958549c2..d9efaa5c085fa 100644
--- a/llvm/test/Transforms/LoopVectorize/loop-vect-memdep.ll
+++ b/llvm/test/Transforms/LoopVectorize/loop-vect-memdep.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; RUN: opt < %s -S -loop-vectorize -debug-only=loop-vectorize 2>&1 | FileCheck %s
@@ -6,24 +5,6 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: LV: Can't vectorize due to memory conflicts
 
 define void @test_loop_novect(double** %arr, i64 %n) {
-; CHECK-LABEL: @test_loop_novect(
-; CHECK-NEXT:  for.body.lr.ph:
-; CHECK-NEXT:    [[T:%.*]] = load double*, double** [[ARR:%.*]], align 8
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH:%.*]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds double, double* [[T]], i64 [[I]]
-; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
-; CHECK-NEXT:    [[A_NEXT:%.*]] = getelementptr inbounds double, double* [[T]], i64 [[I_NEXT]]
-; CHECK-NEXT:    [[T1:%.*]] = load double, double* [[A]], align 8
-; CHECK-NEXT:    [[T2:%.*]] = load double, double* [[A_NEXT]], align 8
-; CHECK-NEXT:    store double [[T1]], double* [[A_NEXT]], align 8
-; CHECK-NEXT:    store double [[T2]], double* [[A]], align 8
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i64 [[I]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[C]], label [[FINAL:%.*]], label [[FOR_BODY]]
-; CHECK:       final:
-; CHECK-NEXT:    ret void
-;
 for.body.lr.ph:
   %t = load double*, double** %arr, align 8
   br label %for.body

diff --git a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
index fff95f2704a7d..8e007a63ba40d 100644
--- a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
@@ -67,7 +67,7 @@ define void @maxvf3() {
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -82,7 +82,7 @@ define void @maxvf3() {
 ; CHECK-NEXT:    store i8 7, i8* [[AJP3]], align 8
 ; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i32 [[J]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[J_NEXT]], 15
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;

diff --git a/llvm/test/Transforms/LoopVectorize/multiple-exits-versioning.ll b/llvm/test/Transforms/LoopVectorize/multiple-exits-versioning.ll
index 54badc5e5d21f..7a2f044d6b741 100644
--- a/llvm/test/Transforms/LoopVectorize/multiple-exits-versioning.ll
+++ b/llvm/test/Transforms/LoopVectorize/multiple-exits-versioning.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
 
 ; Test cases to make sure LV & loop versioning can handle loops with
@@ -7,64 +6,12 @@
 ; Multiple branches exiting the loop to a unique exit block. The loop should
 ; be vectorized with versioning & noalias metadata should be added.
 define void @multiple_exits_unique_exit_block(i32* %A, i32* %B, i64 %N) {
-; CHECK-LABEL: @multiple_exits_unique_exit_block(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-NEXT:    [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8*
-; CHECK-NEXT:    [[UMIN6:%.*]] = call i64 @llvm.umin.i64(i64 [[N:%.*]], i64 999)
-; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[UMIN6]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-LABEL: @multiple_exits_unique_exit_block
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 999)
-; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[UMIN]], 1
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i64 2, i64 [[N_MOD_VF]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP3]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4, !alias.scope !0
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[WIDE_LOAD]], <2 x i32>* [[TMP10]], align 4, !alias.scope !3, !noalias !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY:%.*]] ]
-; CHECK-NEXT:    [[COND_0:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[COND_0]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[A_GEP]], align 4
-; CHECK-NEXT:    [[B_GEP:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[LV]], i32* [[B_GEP]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND_1:%.*]] = icmp ult i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: vector.body:
+; CHECK:         %wide.load = load <2 x i32>, <2 x i32>* {{.*}}, align 4, !alias.scope
+; CHECK:         store <2 x i32> %wide.load, <2 x i32>* {{.*}}, align 4, !alias.scope
+; CHECK:         br
 ;
 entry:
   br label %loop.header
@@ -90,66 +37,10 @@ exit:
 
 ; Multiple branches exiting the loop to different blocks. Currently this is not supported.
 define i32 @multiple_exits_multiple_exit_blocks(i32* %A, i32* %B, i64 %N) {
-; CHECK-LABEL: @multiple_exits_multiple_exit_blocks(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-NEXT:    [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8*
-; CHECK-NEXT:    [[UMIN6:%.*]] = call i64 @llvm.umin.i64(i64 [[N:%.*]], i64 999)
-; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[UMIN6]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 999)
-; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[UMIN]], 1
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i64 2, i64 [[N_MOD_VF]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP3]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4, !alias.scope !8
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[WIDE_LOAD]], <2 x i32>* [[TMP10]], align 4, !alias.scope !11, !noalias !8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY:%.*]] ]
-; CHECK-NEXT:    [[COND_0:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[COND_0]], label [[EXIT_0:%.*]], label [[FOR_BODY]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[A_GEP]], align 4
-; CHECK-NEXT:    [[B_GEP:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[LV]], i32* [[B_GEP]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND_1:%.*]] = icmp ult i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND_1]], label [[LOOP_HEADER]], label [[EXIT_1:%.*]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       exit.0:
-; CHECK-NEXT:    ret i32 1
-; CHECK:       exit.1:
-; CHECK-NEXT:    ret i32 2
+; CHECK-LABEL: @multiple_exits_multiple_exit_blocks
+; CHECK-NEXT:    entry:
+; CHECK:           br label %loop.header
+; CHECK-NOT:      <2 x i32>
 ;
 entry:
   br label %loop.header

diff --git a/llvm/test/Transforms/LoopVectorize/no-interleave-up-front.ll b/llvm/test/Transforms/LoopVectorize/no-interleave-up-front.ll
index 2b71a7f22fd99..a0c5345048228 100644
--- a/llvm/test/Transforms/LoopVectorize/no-interleave-up-front.ll
+++ b/llvm/test/Transforms/LoopVectorize/no-interleave-up-front.ll
@@ -14,7 +14,7 @@ define void @test1(i32 %n) #0 {
 ; CHECK:       .lr.ph:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[DOTLR_PH]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[DOTLR_PH]], !llvm.loop !0
 ; CHECK:       ._crit_edge.loopexit:
 ; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
 ; CHECK:       ._crit_edge:

diff --git a/llvm/test/Transforms/LoopVectorize/no_array_bounds.ll b/llvm/test/Transforms/LoopVectorize/no_array_bounds.ll
index 90badfd7400b4..c6a2431eba5ba 100644
--- a/llvm/test/Transforms/LoopVectorize/no_array_bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/no_array_bounds.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -transform-warning -S 2>&1 | FileCheck %s
 
 ; Verify warning is generated when vectorization/ interleaving is explicitly specified and fails to occur.
@@ -19,47 +18,6 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind ssp uwtable
 define void @_Z4testPiS_i(i32* nocapture %A, i32* nocapture %B, i32 %number) #0 !dbg !4 {
-; CHECK-LABEL: @_Z4testPiS_i(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP25:%.*]] = icmp sgt i32 [[NUMBER:%.*]], 0, !dbg [[DBG8:![0-9]+]]
-; CHECK-NEXT:    br i1 [[CMP25]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END15:%.*]], !dbg [[DBG8]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]], !dbg [[DBG12:![0-9]+]]
-; CHECK:       for.cond5.preheader:
-; CHECK-NEXT:    br i1 [[CMP25]], label [[FOR_BODY7_PREHEADER:%.*]], label [[FOR_END15]], !dbg [[DBG14:![0-9]+]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       for.body7.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY7:%.*]], !dbg [[DBG18:![0-9]+]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV27:%.*]] = phi i64 [ [[INDVARS_IV_NEXT28:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV27]], !dbg [[DBG12]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !dbg [[DBG12]], !tbaa [[TBAA20:![0-9]+]]
-; CHECK-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[TMP0]] to i64, !dbg [[DBG12]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM1]], !dbg [[DBG12]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !dbg [[DBG12]], !tbaa [[TBAA20]]
-; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1, !dbg [[DBG12]]
-; CHECK-NEXT:    store i32 [[INC]], i32* [[ARRAYIDX2]], align 4, !dbg [[DBG12]], !tbaa [[TBAA20]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT28]] = add nuw nsw i64 [[INDVARS_IV27]], 1, !dbg [[DBG8]]
-; CHECK-NEXT:    [[LFTR_WIDEIV29:%.*]] = trunc i64 [[INDVARS_IV_NEXT28]] to i32, !dbg [[DBG8]]
-; CHECK-NEXT:    [[EXITCOND30:%.*]] = icmp eq i32 [[LFTR_WIDEIV29]], [[NUMBER]], !dbg [[DBG8]]
-; CHECK-NEXT:    br i1 [[EXITCOND30]], label [[FOR_COND5_PREHEADER:%.*]], label [[FOR_BODY]], !dbg [[DBG8]], !llvm.loop [[LOOP10]]
-; CHECK:       for.body7:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY7]] ], [ 0, [[FOR_BODY7_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]], !dbg [[DBG18]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4, !dbg [[DBG18]], !tbaa [[TBAA20]]
-; CHECK-NEXT:    [[IDXPROM10:%.*]] = sext i32 [[TMP2]] to i64, !dbg [[DBG18]]
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IDXPROM10]], !dbg [[DBG18]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX11]], align 4, !dbg [[DBG18]], !tbaa [[TBAA20]]
-; CHECK-NEXT:    [[INC12:%.*]] = add nsw i32 [[TMP3]], 1, !dbg [[DBG18]]
-; CHECK-NEXT:    store i32 [[INC12]], i32* [[ARRAYIDX11]], align 4, !dbg [[DBG18]], !tbaa [[TBAA20]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG14]]
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG14]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[NUMBER]], !dbg [[DBG14]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END15_LOOPEXIT:%.*]], label [[FOR_BODY7]], !dbg [[DBG14]], !llvm.loop [[LOOP16]]
-; CHECK:       for.end15.loopexit:
-; CHECK-NEXT:    br label [[FOR_END15]]
-; CHECK:       for.end15:
-; CHECK-NEXT:    ret void, !dbg [[DBG24:![0-9]+]]
-;
 entry:
   %cmp25 = icmp sgt i32 %number, 0, !dbg !10
   br i1 %cmp25, label %for.body.preheader, label %for.end15, !dbg !10, !llvm.loop !12

diff --git a/llvm/test/Transforms/LoopVectorize/no_idiv_reduction.ll b/llvm/test/Transforms/LoopVectorize/no_idiv_reduction.ll
index c47a433e919d7..bfa48a2529b97 100644
--- a/llvm/test/Transforms/LoopVectorize/no_idiv_reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/no_idiv_reduction.ll
@@ -1,30 +1,14 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S < %s | FileCheck %s
 @a = common global [128 x i32] zeroinitializer, align 16
 
 ;; Must not vectorize division reduction. Division is lossy.
 define i32 @g() {
-; CHECK-LABEL: @g(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[R_05:%.*]] = phi i32 [ 80, [[ENTRY]] ], [ [[DIV:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [128 x i32], [128 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[DIV]] = sdiv i32 [[R_05]], [[TMP0]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[DIV_LCSSA:%.*]] = phi i32 [ [[DIV]], [[FOR_BODY]] ]
-; CHECK-NEXT:    ret i32 [[DIV_LCSSA]]
-;
 entry:
   br label %for.body
 
 for.body:
+  ; CHECK-LABEL: @g(
+  ; CHECK-NOT: sdiv <2 x i32>
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %r.05 = phi i32 [ 80, %entry ], [ %div, %for.body ]
   %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @a, i64 0, i64 %indvars.iv

diff --git a/llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll b/llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll
index 5dccb2805f865..c343bb8dd6a32 100644
--- a/llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll
+++ b/llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -basic-aa -scoped-noalias-aa -loop-vectorize -licm -force-vector-width=2 \
 ; RUN:     -force-vector-interleave=1 -S < %s | FileCheck %s
 
@@ -16,85 +15,6 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 
 define void @f(i32* %a, i32* %b, i32* %c) {
-; CHECK-LABEL: @f(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
-; CHECK-NEXT:    [[C7:%.*]] = bitcast i32* [[C:%.*]] to i8*
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 20
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP8:%.*]] = getelementptr i32, i32* [[C]], i64 20
-; CHECK-NEXT:    [[SCEVGEP89:%.*]] = bitcast i32* [[SCEVGEP8]] to i8*
-; CHECK-NEXT:    [[BOUND010:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP89]]
-; CHECK-NEXT:    [[BOUND111:%.*]] = icmp ult i8* [[C7]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]]
-; CHECK-NEXT:    br label [[OUTER:%.*]]
-; CHECK:       outer:
-; CHECK-NEXT:    [[I_2:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[I:%.*]], [[INNER_END:%.*]] ]
-; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[I_2]]
-; CHECK-NEXT:    [[SCEVGEP34:%.*]] = bitcast i32* [[SCEVGEP3]] to i8*
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[I_2]], 1
-; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8*
-; CHECK-NEXT:    [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_2]]
-; CHECK-NEXT:    br label [[INNER_PH:%.*]]
-; CHECK:       inner.ph:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP56]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
-; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDXB]], align 4, !alias.scope !0
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4, !alias.scope !3, !noalias !5
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD13:%.*]] = load <2 x i32>, <2 x i32>* [[TMP8]], align 4, !alias.scope !7
-; CHECK-NEXT:    [[TMP9:%.*]] = add nuw <2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add nuw <2 x i32> [[TMP9]], [[WIDE_LOAD13]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP10]], <2 x i32>* [[TMP11]], align 4, !alias.scope !3, !noalias !5
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[INNER_END]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_PH]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[INNER:%.*]]
-; CHECK:       inner:
-; CHECK-NEXT:    [[J_2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J:%.*]], [[INNER]] ]
-; CHECK-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[J_2]]
-; CHECK-NEXT:    [[LOADA:%.*]] = load i32, i32* [[ARRAYIDXA]], align 4
-; CHECK-NEXT:    [[LOADB:%.*]] = load i32, i32* [[ARRAYIDXB]], align 4
-; CHECK-NEXT:    [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[J_2]]
-; CHECK-NEXT:    [[LOADC:%.*]] = load i32, i32* [[ARRAYIDXC]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nuw i32 [[LOADA]], [[LOADB]]
-; CHECK-NEXT:    [[ADD2:%.*]] = add nuw i32 [[ADD]], [[LOADC]]
-; CHECK-NEXT:    store i32 [[ADD2]], i32* [[ARRAYIDXA]], align 4
-; CHECK-NEXT:    [[J]] = add nuw nsw i64 [[J_2]], 1
-; CHECK-NEXT:    [[COND1:%.*]] = icmp eq i64 [[J]], 20
-; CHECK-NEXT:    br i1 [[COND1]], label [[INNER_END_LOOPEXIT:%.*]], label [[INNER]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       inner.end.loopexit:
-; CHECK-NEXT:    br label [[INNER_END]]
-; CHECK:       inner.end:
-; CHECK-NEXT:    [[I]] = add nuw nsw i64 [[I_2]], 1
-; CHECK-NEXT:    [[COND2:%.*]] = icmp eq i64 [[I]], 30
-; CHECK-NEXT:    br i1 [[COND2]], label [[OUTER_END:%.*]], label [[OUTER]]
-; CHECK:       outer.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %outer
 
@@ -104,6 +24,9 @@ outer:
   br label %inner.ph
 
 inner.ph:
+; CHECK: vector.ph:
+; CHECK: load i32, i32* %arrayidxB,
+; CHECK: br label %vector.body
   br label %inner
 
 inner:

diff --git a/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll b/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll
index 834abb9c571f5..140fc219c1d31 100644
--- a/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll
+++ b/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll
@@ -1,62 +1,17 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -force-vector-interleave=2  -S | FileCheck %s
 
 define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[WIDE_LOAD]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1600, 1600
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1600, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 1.000000e+02
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP17]], 1.000000e+00
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body
 
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
@@ -80,80 +35,6 @@ declare void @llvm.experimental.noalias.scope.decl(metadata)
 %struct.data = type { float*, float* }
 
 define void @test2(%struct.data* nocapture readonly %d) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_DATA:%.*]], %struct.data* [[D:%.*]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float*, float** [[B]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to i8*
-; CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint float* [[TMP0]] to i64
-; CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31
-; CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-; CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_DATA]], %struct.data* [[D]], i64 0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = load float*, float** [[A]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to i8*
-; CHECK-NEXT:    [[PTRINT2:%.*]] = ptrtoint float* [[TMP2]] to i64
-; CHECK-NEXT:    [[MASKEDPTR3:%.*]] = and i64 [[PTRINT2]], 31
-; CHECK-NEXT:    [[MASKCOND4:%.*]] = icmp eq i64 [[MASKEDPTR3]], 0
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[TMP2]], i64 1600
-; CHECK-NEXT:    [[SCEVGEP1:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr float, float* [[TMP0]], i64 1600
-; CHECK-NEXT:    [[SCEVGEP23:%.*]] = bitcast float* [[SCEVGEP2]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[TMP3]], [[SCEVGEP23]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TMP1]], [[SCEVGEP1]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !7
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP11]], align 4, !alias.scope !7
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x float> [[WIDE_LOAD]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP13:%.*]] = fadd <4 x float> [[WIDE_LOAD4]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]])
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP12]], <4 x float>* [[TMP17]], align 4, !alias.scope !12, !noalias !7
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 4
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP13]], <4 x float>* [[TMP19]], align 4, !alias.scope !12, !noalias !7
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1600, 1600
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1600, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP21]], 1.000000e+00
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META10]])
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
   %0 = load float*, float** %b, align 8
@@ -167,6 +48,16 @@ entry:
   %maskcond4 = icmp eq i64 %maskedptr3, 0
   br label %for.body
 
+; CHECK-LABEL: @test2
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST:!.*]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST:!.*]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
@@ -186,85 +77,19 @@ for.end:                                          ; preds = %for.body
 }
 
 define void @predicated_noalias_scope_decl(float* noalias nocapture readonly %a, float* noalias nocapture %b, i32 %n) {
+
 ; Check that the vector.body still contains a llvm.experimental.noalias.scope.decl
+
 ; CHECK-LABEL: @predicated_noalias_scope_decl(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i64> [[VEC_IND]], <i64 495616, i64 495616, i64 495616, i64 495616>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <4 x i64> [[STEP_ADD]], <i64 495616, i64 495616, i64 495616, i64 495616>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult <4 x i64> [[VEC_IND]], <i64 991232, i64 991232, i64 991232, i64 991232>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult <4 x i64> [[STEP_ADD]], <i64 991232, i64 991232, i64 991232, i64 991232>
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x float> <float 2.300000e+01, float 2.300000e+01, float 2.300000e+01, float 2.300000e+01>, <4 x float> <float 4.200000e+01, float 4.200000e+01, float 4.200000e+01, float 4.200000e+01>
-; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP4]], <4 x float> <float 2.300000e+01, float 2.300000e+01, float 2.300000e+01, float 2.300000e+01>, <4 x float> <float 4.200000e+01, float 4.200000e+01, float 4.200000e+01, float 4.200000e+01>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 4
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul <4 x float> [[PREDPHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul <4 x float> [[PREDPHI2]], [[WIDE_LOAD3]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP15]], <4 x float>* [[TMP20]], align 4
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 4
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP21]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP16]], <4 x float>* [[TMP22]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END5:%.*]] ]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i64 [[INDVARS_IV]], 495616
-; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_END5]], label [[IF_ELSE:%.*]]
-; CHECK:       if.else:
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 991232
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT:    br label [[IF_END5]]
-; CHECK:       if.end5:
-; CHECK-NEXT:    [[X_0:%.*]] = phi float [ 4.200000e+01, [[IF_ELSE]] ], [ 2.300000e+01, [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[X_0]], [[TMP24]]
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[MUL]], float* [[ARRAYIDX7]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-;
+; CHECK:   vector.body:
+; CHECK:   call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK:   scalar.ph:
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK:   if.else:
+; CHECK:   call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: }
 
 entry:
   %cmp15 = icmp eq i32 %n, 0
@@ -308,3 +133,8 @@ if.end5:                                          ; preds = %for.body, %if.else
 !3 = distinct !{ !3, !2 }
 !4 = !{ !3 }
 
+; CHECK: [[SCOPE0_LIST]] = !{[[SCOPE0:!.*]]}
+; CHECK: [[SCOPE0]] = distinct !{[[SCOPE0]], [[SCOPE0_DOM:!.*]]}
+; CHECK: [[SCOPE0_DOM]] = distinct !{[[SCOPE0_DOM]]}
+; CHECK: [[SCOPE4_LIST]] = !{[[SCOPE4:!.*]]}
+; CHECK: [[SCOPE4]] = distinct !{[[SCOPE4]], [[SCOPE0_DOM]]}

diff --git a/llvm/test/Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll b/llvm/test/Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll
index 261321fe38bef..353fbe013f12f 100644
--- a/llvm/test/Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll
+++ b/llvm/test/Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -passes="loop-vectorize,jump-threading" -debug-pass-manager < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -7,20 +6,26 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; vectorization happens, and the only change LV makes is LCSSA formation.
 
 define i32 @novect(i32* %p) {
-; CHECK-LABEL: @novect(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[MIDDLE:%.*]]
-; CHECK:       middle:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[MIDDLE]] ]
-; CHECK-NEXT:    [[X:%.*]] = load volatile i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp slt i32 [[IV]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[MIDDLE]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X]], [[MIDDLE]] ]
-; CHECK-NEXT:    ret i32 [[X_LCSSA]]
-;
 
+; CHECK:           Running pass: LoopVectorizePass on novect
+; CHECK:           Clearing all analysis results for: <possibly invalidated loop>
+; CHECK:           Invalidating analysis: ScalarEvolutionAnalysis on novect
+; CHECK-NOT:       Invalidating analysis: BranchProbabilityAnalysis on novect
+; CHECK-NOT:       Invalidating analysis: BlockFrequencyAnalysis on novect
+; CHECK:           Invalidating analysis: DemandedBitsAnalysis on novect
+; CHECK:           Running pass: JumpThreadingPass on novect
+
+; CHECK:           entry:
+; CHECK:             br label %middle
+; CHECK:           middle:
+; CHECK:             %iv = phi i32 [ 0, %entry ], [ %iv.next, %middle ]
+; CHECK:             %x = load volatile i32, i32* %p
+; CHECK:             %iv.next = add i32 %iv, 1
+; CHECK:             %cond = icmp slt i32 %iv, 1000
+; CHECK:             br i1 %cond, label %exit, label %middle
+; CHECK:           exit:
+; CHECK:             %x.lcssa = phi i32 [ %x, %middle ]
+; CHECK:             ret i32 %x.lcssa
 
 entry:
   br label %middle

diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
index 5d177833f9ae0..1756a44ab5132 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
@@ -40,7 +40,7 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
 ; VF-TWO-CHECK-NEXT:    [[TMP7:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
 ; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF-TWO-CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF-TWO-CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF-TWO-CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; VF-TWO-CHECK:       middle.block:
 ; VF-TWO-CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
@@ -55,24 +55,24 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
 ; VF-TWO-CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF4]]
 ; VF-TWO-CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-TWO-CHECK:       vec.epilog.vector.body:
-; VF-TWO-CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-TWO-CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
+; VF-TWO-CHECK-NEXT:    [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX6]], 0
 ; VF-TWO-CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
 ; VF-TWO-CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
 ; VF-TWO-CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4
 ; VF-TWO-CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP10]]
 ; VF-TWO-CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
 ; VF-TWO-CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4
-; VF-TWO-CHECK-NEXT:    [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_LOAD8]], [[WIDE_LOAD9]]
-; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 2
-; VF-TWO-CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
-; VF-TWO-CHECK-NEXT:    br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD10]]
+; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT7]] = add nuw i64 [[INDEX6]], 2
+; VF-TWO-CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC5]]
+; VF-TWO-CHECK-NEXT:    br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]]
 ; VF-TWO-CHECK:       vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC5]]
+; VF-TWO-CHECK-NEXT:    [[CMP_N8:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC5]]
 ; VF-TWO-CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1
-; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N8]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-TWO-CHECK:       vec.epilog.scalar.ph:
 ; VF-TWO-CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; VF-TWO-CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -85,7 +85,7 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
 ; VF-TWO-CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
 ; VF-TWO-CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; VF-TWO-CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; VF-TWO-CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF-TWO-CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOP4:!llvm.loop !.*]]
 ; VF-TWO-CHECK:       for.end.loopexit.loopexit:
 ; VF-TWO-CHECK-NEXT:    [[ADD_LCSSA3:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; VF-TWO-CHECK-NEXT:    br label [[FOR_END_LOOPEXIT]]

diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
index a17b368f2a46f..f3d307991048c 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt < %s  -passes='loop-vectorize' -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S -scalable-vectorization=on 2>&1 | FileCheck %s
 
@@ -10,70 +9,6 @@ target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
 ; CHECK: Epilogue Loop VF:2, Epilogue Loop UF:1
 
 define void @f1(i8* %A) {
-; CHECK-LABEL: @f1(
-; CHECK-NEXT:  iter.check:
-; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <vscale x 4 x i8>*
-; CHECK-NEXT:    store <vscale x 4 x i8> shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i32 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i8>* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
-; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <2 x i8>*
-; CHECK-NEXT:    store <2 x i8> <i8 1, i8 1>, <2 x i8>* [[TMP14]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
-; CHECK-NEXT:    br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N1:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV]]
-; CHECK-NEXT:    store i8 1, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       exit.loopexit:
-; CHECK-NEXT:    br label [[EXIT]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body
 

diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
index 12e954188a01a..7dfc9969d032e 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
@@ -60,27 +60,27 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias
 ; CHECK-NEXT:    [[N_VEC4:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF3]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX5]], 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* [[TMP15]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP15]], align 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = bitcast float* [[TMP17]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP18]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD8]]
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x float>, <4 x float>* [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = fadd fast <4 x float> [[WIDE_LOAD8]], [[WIDE_LOAD9]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
 ; CHECK-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP21]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP19]], <4 x float>* [[TMP22]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
+; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC4]]
 ; CHECK-NEXT:    br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]]
-; CHECK-NEXT:    br i1 [[CMP_N5]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N7:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]]
+; CHECK-NEXT:    br i1 [[CMP_N7]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -171,12 +171,12 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]])
 ; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
-; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -187,30 +187,30 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], -1
-; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], [[N]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 -3
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP15]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP16]], align 4
-; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP17:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP12]], -1
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], [[N]]
+; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP18]], i32 -3
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP17]], <4 x float>* [[TMP20]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP20]], align 4
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP21:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast float* [[TMP23]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP21]], <4 x float>* [[TMP24]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END5:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END7:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -221,43 +221,43 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC3]] to i32
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX8:%.*]] = trunc i64 [[OFFSET_IDX9]] to i32
-; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[OFFSET_IDX8]], 0
-; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX9]], 0
-; CHECK-NEXT:    [[TMP24:%.*]] = xor i32 [[TMP22]], -1
-; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP24]], [[N]]
-; CHECK-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP26]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP27]], i32 0
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP28]], i32 -3
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast float* [[TMP29]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP30]], align 4
-; CHECK-NEXT:    [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP31:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 0
+; CHECK-NEXT:    [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX9:%.*]] = trunc i64 [[INDEX4]] to i32
+; CHECK-NEXT:    [[TMP26:%.*]] = add i32 [[OFFSET_IDX9]], 0
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[INDEX4]], 0
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i32 [[TMP26]], -1
+; CHECK-NEXT:    [[TMP29:%.*]] = add i32 [[TMP28]], [[N]]
+; CHECK-NEXT:    [[TMP30:%.*]] = sext i32 [[TMP29]] to i64
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP30]]
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP31]], i32 0
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 -3
 ; CHECK-NEXT:    [[TMP34:%.*]] = bitcast float* [[TMP33]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP31]], <4 x float>* [[TMP34]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT12]] = add nuw i64 [[OFFSET_IDX9]], 4
-; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP35]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP34]], align 4
+; CHECK-NEXT:    [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP35:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP27]]
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP36]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast float* [[TMP37]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP35]], <4 x float>* [[TMP38]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT5]] = add nuw i64 [[INDEX4]], 4
+; CHECK-NEXT:    [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP39]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N8:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[CMP_N8]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END7]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[I_014]], -1
-; CHECK-NEXT:    [[SUB2:%.*]] = add i32 [[TMP36]], [[N]]
+; CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP40:%.*]] = xor i32 [[I_014]], -1
+; CHECK-NEXT:    [[SUB2:%.*]] = add i32 [[TMP40]], [[N]]
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP37:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CONV3:%.*]] = fadd fast float [[TMP37]], 1.000000e+00
+; CHECK-NEXT:    [[TMP41:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CONV3:%.*]] = fadd fast float [[TMP41]], 1.000000e+00
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store float [[CONV3]], float* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -366,18 +366,18 @@ define void @f3(i8* noalias %A, i64 %n) {
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX4]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
 ; CHECK-NEXT:    store <4 x i8> <i8 1, i8 1, i8 1, i8 1>, <4 x i8>* [[TMP8]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
+; CHECK-NEXT:    [[INDEX_NEXT5]] = add nuw i64 [[INDEX4]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[CMP_N4]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -429,18 +429,18 @@ define void @f3(i8* noalias %A, i64 %n) {
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.vector.body:
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX4]], 0
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP5]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store <2 x i8> <i8 1, i8 1>, <2 x i8>* [[TMP8]], align 1
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], 2
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDEX_NEXT5]] = add nuw i64 [[INDEX4]], 2
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC3]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.middle.block:
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[CMP_N4]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.scalar.ph:
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]

diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll
index e130643c4c2d9..d1bc8687996cc 100644
--- a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll
@@ -45,8 +45,8 @@ define void @test([2000 x i32]* %src, i64 %n) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -75,7 +75,7 @@ define void @test([2000 x i32]* %src, i64 %n) {
 ; CHECK:       loop.1.latch:
 ; CHECK-NEXT:    [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1
 ; CHECK-NEXT:    [[EC_1:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC_1]], label [[EXIT]], label [[LOOP_1_HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC_1]], label [[EXIT]], label [[LOOP_1_HEADER]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;

diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index cde46b8f6f938..cc604130005d5 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -146,22 +146,22 @@ define void @pointer_induction_used_as_vector(i8** noalias %start.1, i8* noalias
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8*, i8** [[START_1]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP1]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to <4 x i8*>*
-; CHECK-NEXT:    store <4 x i8*> [[TMP2]], <4 x i8*>* [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i8*> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP8]], <4 x i8>* [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8** [[TMP4]] to <4 x i8*>*
+; CHECK-NEXT:    store <4 x i8*> [[TMP3]], <4 x i8*>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8*> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
+; CHECK-NEXT:    store <4 x i8> [[TMP9]], <4 x i8>* [[TMP10]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]

diff --git a/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
index cea268958372b..d5625d615b492 100644
--- a/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
@@ -51,19 +51,19 @@ define void @doit1(i32 %n, i32 %step) local_unnamed_addr {
 ; CHECK-NEXT:    [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[TMP4]], i8 [[TMP5]])
 ; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i8 0, [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i8 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i1 [[TMP8]], i1 [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i8 0, [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt i8 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP3]], i1 [[TMP9]], i1 [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP10]], [[MUL_OVERFLOW]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], 255
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i8 [[TMP1]], 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP10]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = sext i8 [[TMP1]] to i32
-; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = or i1 [[TMP14]], [[IDENT_CHECK]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or i1 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP15]], [[IDENT_CHECK]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
@@ -71,24 +71,24 @@ define void @doit1(i32 %n, i32 %step) local_unnamed_addr {
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[CAST_CRD]], [[STEP]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP17]]
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[STEP]], 4
-; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[STEP]], 4
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP22]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP24]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -176,18 +176,18 @@ define void @doit2(i32 %n, i32 %step) local_unnamed_addr  {
 ; CHECK-NEXT:    [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[TMP4]], i8 [[TMP5]])
 ; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i8 0, [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i8 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false
-; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[TMP0]], 255
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i8 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = sext i8 [[TMP1]] to i32
-; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP14]]
-; CHECK-NEXT:    [[TMP15:%.*]] = or i1 [[TMP13]], [[IDENT_CHECK]]
-; CHECK-NEXT:    br i1 [[TMP15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i8 0, [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ugt i8 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP3]], i1 [[TMP9]], i1 false
+; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP10]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], 255
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or i1 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP15]], [[IDENT_CHECK]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
@@ -195,24 +195,24 @@ define void @doit2(i32 %n, i32 %step) local_unnamed_addr  {
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[CAST_CRD]], [[STEP]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP16:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP16]]
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i32 [[STEP]], 4
-; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[STEP]], 4
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP22]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP24]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -374,16 +374,16 @@ define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr {
 ; CHECK-NEXT:    [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[TMP3]], i8 [[TMP4]])
 ; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = sub i8 0, [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt i8 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP2]], i1 [[TMP7]], i1 [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub i8 0, [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i8 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP2]], i1 [[TMP8]], i1 [[TMP7]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[TMP0]], 255
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i8 [[CSTEP]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP13]], [[TMP12]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
@@ -391,24 +391,24 @@ define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr {
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[CAST_CRD]], [[CONV]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[CONV]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP14]]
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[CONV]], 4
-; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = mul i32 [[CONV]], 4
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP17]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP19]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP21]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]

diff --git a/llvm/test/Transforms/LoopVectorize/pr30806-phi-scev.ll b/llvm/test/Transforms/LoopVectorize/pr30806-phi-scev.ll
index 620faae45d64a..e04bdd6607d79 100644
--- a/llvm/test/Transforms/LoopVectorize/pr30806-phi-scev.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr30806-phi-scev.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -indvars < %s | FileCheck %s
 
 ; Produced from the test-case:
@@ -26,34 +25,6 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @theSize = external local_unnamed_addr global i32, align 4
 
 define void @foo(i8* %buf, i32 %denominator, i32* %flag) local_unnamed_addr {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @theSize, align 4
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i32 [[TMP0]], [[DENOMINATOR:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[FLAG:%.*]], align 4
-; CHECK-NEXT:    [[TOBOOL5:%.*]] = icmp eq i32 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL5]], label [[WHILE_END:%.*]], label [[WHILE_BODY_LR_PH:%.*]]
-; CHECK:       while.body.lr.ph:
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_LR_PH]] ]
-; CHECK-NEXT:    [[BUF_ADDR_07:%.*]] = phi i8* [ [[BUF:%.*]], [[WHILE_BODY_LR_PH]] ], [ [[CALL:%.*]], [[WHILE_BODY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[DIV]] to i64
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* @theSize, align 4
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[I]], align 4
-; CHECK-NEXT:    call void @bar(i32* nonnull [[I]], i64 [[INDVARS_IV_NEXT]])
-; CHECK-NEXT:    [[CALL]] = call i8* @processBuf(i8* [[BUF_ADDR_07]])
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[FLAG]], align 4
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[I]] to i8*
-; CHECK-NEXT:    br label [[WHILE_END]]
-; CHECK:       while.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   %i = alloca i32, align 4
   %0 = load i32, i32* @theSize, align 4
@@ -69,6 +40,9 @@ while.body.lr.ph:                                 ; preds = %entry
 while.body:                                       ; preds = %while.body.lr.ph, %while.body
 ; Check that there are two PHIs followed by a 'sext' in the same block, and that
 ; the test does not crash.
+; CHECK:        phi
+; CHECK-NEXT:   phi
+; CHECK-NEXT:   sext
   %buf.addr.07 = phi i8* [ %buf, %while.body.lr.ph ], [ %call, %while.body ]
   %inx.06 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
   %add = add nsw i32 %inx.06, %div

diff --git a/llvm/test/Transforms/LoopVectorize/pr30806.ll b/llvm/test/Transforms/LoopVectorize/pr30806.ll
index 4708ddf50c47d..dd9f66294137d 100644
--- a/llvm/test/Transforms/LoopVectorize/pr30806.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr30806.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -S < %s 2>&1 | FileCheck %s
 
 ; Produced from test-case:
@@ -20,66 +19,6 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
 target triple = "x86_64-unknown-linux-gnu"
 
 define void @testGuardedInnerLoop(i32* %ptr, i32 %denom, i32 %numer, i32 %outer_lim) {
-; CHECK-LABEL: @testGuardedInnerLoop(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[OUTER_LIM:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP1]], label [[EXIT:%.*]], label [[LOOP1_PREHEADER:%.*]]
-; CHECK:       loop1.preheader:
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
-; CHECK:       loop1:
-; CHECK-NEXT:    [[OUTER_I:%.*]] = phi i32 [ [[INC1:%.*]], [[LOOP2_EXIT:%.*]] ], [ 0, [[LOOP1_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[DENOM:%.*]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP0]], [[NUMER:%.*]]
-; CHECK-NEXT:    br i1 [[TMP1]], label [[LOOP2_PREHEADER:%.*]], label [[LOOP2_EXIT]]
-; CHECK:       loop2.preheader:
-; CHECK-NEXT:    [[LIM:%.*]] = udiv i32 [[NUMER]], [[DENOM]]
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[LIM]] to i64
-; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 4
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP2_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP2_PREHEADER]] ]
-; CHECK-NEXT:    br label [[LOOP2:%.*]]
-; CHECK:       loop2:
-; CHECK-NEXT:    [[INDVAR_LOOP2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR_LOOP2_NEXT:%.*]], [[LOOP2]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[INDVAR_LOOP2]]
-; CHECK-NEXT:    store i32 1, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVAR_LOOP2_NEXT]] = add nuw nsw i64 [[INDVAR_LOOP2]], 1
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[INDVAR_LOOP2_NEXT]], [[TMP2]]
-; CHECK-NEXT:    br i1 [[CMP2]], label [[LOOP2]], label [[LOOP2_EXIT_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       loop2.exit.loopexit:
-; CHECK-NEXT:    br label [[LOOP2_EXIT]]
-; CHECK:       loop2.exit:
-; CHECK-NEXT:    [[INC1]] = add nuw i32 [[OUTER_I]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC1]], [[OUTER_LIM]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP1]]
-; CHECK:       exit.loopexit:
-; CHECK-NEXT:    br label [[EXIT]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
 entry:
   %cmp1 = icmp eq i32 %outer_lim, 0
   br i1 %cmp1, label %exit, label %loop1.preheader
@@ -87,6 +26,9 @@ entry:
 ; Verify that a 'udiv' does not appear between the 'loop1.preheader' label, and
 ; whatever label comes next.
 loop1.preheader:
+; CHECK-LABEL: loop1.preheader:
+; CHECK-NOT: udiv
+; CHECK-LABEL: :
   br label %loop1
 
 loop1:
@@ -98,6 +40,9 @@ loop1:
 ; Verify that a 'udiv' does appear between the 'loop2.preheader' label, and
 ; whatever label comes next.
 loop2.preheader:
+; CHECK-LABEL: loop2.preheader:
+; CHECK: udiv
+; CHECK-LABEL: :
   %lim = udiv i32 %numer, %denom
   %2 = zext i32 %lim to i64
   br label %loop2

diff --git a/llvm/test/Transforms/LoopVectorize/pr35743.ll b/llvm/test/Transforms/LoopVectorize/pr35743.ll
index 07729397fa5a7..7dc67e4a9b6bc 100644
--- a/llvm/test/Transforms/LoopVectorize/pr35743.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr35743.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s  -loop-vectorize -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
@@ -6,21 +5,10 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; This cannot be correctly vectorized with type i1.
 define i8 @test_01(i8 %c) #0 {
+
 ; CHECK-LABEL: @test_01(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[ACCUM_PLUS_LCSSA:%.*]] = phi i8 [ [[ACCUM_PLUS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    ret i8 [[ACCUM_PLUS_LCSSA]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[ACCUM_PHI:%.*]] = phi i8 [ [[C:%.*]], [[ENTRY:%.*]] ], [ [[ACCUM_PLUS]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[ACCUM_AND:%.*]] = and i8 [[ACCUM_PHI]], 1
-; CHECK-NEXT:    [[ACCUM_PLUS]] = add nuw nsw i8 [[ACCUM_AND]], 3
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i32 [[IV]], 191
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]]
-;
+; CHECK-NOT:   vector.body:
+; CHECK-NOT:   zext i1 {{.*}} to i8
 
 entry:
   br label %loop
@@ -40,21 +28,9 @@ loop:                                            ; preds = %loop, %entry
 
 ; TODO: This can be vectorized with type i1 because the result is not used.
 define void @test_02(i8 %c) #0 {
+
 ; CHECK-LABEL: @test_02(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[LCSSA:%.*]] = phi i8 [ [[ACCUM_PLUS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    ret void
-; CHECK:       loop:
-; CHECK-NEXT:    [[ACCUM_PHI:%.*]] = phi i8 [ [[C:%.*]], [[ENTRY:%.*]] ], [ [[ACCUM_PLUS]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[ACCUM_AND:%.*]] = and i8 [[ACCUM_PHI]], 1
-; CHECK-NEXT:    [[ACCUM_PLUS]] = add nuw nsw i8 [[ACCUM_AND]], 3
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i32 [[IV]], 191
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]]
-;
+; CHECK-NOT:   vector.body:
 
 entry:
   br label %loop
@@ -75,80 +51,10 @@ loop:                                            ; preds = %loop, %entry
 
 ; This can be vectorized with type i1 because the result is truncated properly.
 define i1 @test_03(i8 %c) #0 {
+
 ; CHECK-LABEL: @test_03(
-; CHECK-NEXT:  iter.check:
-; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i8> zeroinitializer, i8 [[C:%.*]], i32 0
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i8> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <16 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i8> [[VEC_PHI]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = and <16 x i8> [[VEC_PHI1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i8> [[TMP1]], <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <16 x i8> [[TMP2]], <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 192
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc <16 x i8> [[TMP3]] to <16 x i1>
-; CHECK-NEXT:    [[TMP7]] = zext <16 x i1> [[TMP6]] to <16 x i8>
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i8> [[TMP4]] to <16 x i1>
-; CHECK-NEXT:    [[TMP9]] = zext <16 x i1> [[TMP8]] to <16 x i8>
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc <16 x i8> [[TMP7]] to <16 x i1>
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc <16 x i8> [[TMP9]] to <16 x i1>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <16 x i1> [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> [[BIN_RDX]])
-; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i8
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 192, 192
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i8 [ [[C]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 192, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i8> zeroinitializer, i8 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX3:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <8 x i8> [ [[TMP14]], [[VEC_EPILOG_PH]] ], [ [[TMP19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = and <8 x i8> [[VEC_PHI4]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP16:%.*]] = add <8 x i8> [[TMP15]], <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; CHECK-NEXT:    [[INDEX_NEXT5]] = add nuw i32 [[INDEX3]], 8
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 192
-; CHECK-NEXT:    [[TMP18:%.*]] = trunc <8 x i8> [[TMP16]] to <8 x i1>
-; CHECK-NEXT:    [[TMP19]] = zext <8 x i1> [[TMP18]] to <8 x i8>
-; CHECK-NEXT:    br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP20:%.*]] = trunc <8 x i8> [[TMP19]] to <8 x i1>
-; CHECK-NEXT:    [[TMP21:%.*]] = call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> [[TMP20]])
-; CHECK-NEXT:    [[TMP22:%.*]] = zext i1 [[TMP21]] to i8
-; CHECK-NEXT:    [[CMP_N2:%.*]] = icmp eq i32 192, 192
-; CHECK-NEXT:    br i1 [[CMP_N2]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 193, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 193, [[VEC_EPILOG_ITER_CHECK]] ], [ 1, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX6:%.*]] = phi i8 [ [[C]], [[ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       exit.loopexit:
-; CHECK-NEXT:    [[ACCUM_PLUS_LCSSA:%.*]] = phi i8 [ [[ACCUM_PLUS:%.*]], [[LOOP]] ], [ [[TMP22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[EXIT]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[LCSSA:%.*]] = phi i8 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[ACCUM_PLUS_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i8 [[LCSSA]] to i1
-; CHECK-NEXT:    ret i1 [[TRUNC]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[ACCUM_PHI:%.*]] = phi i8 [ [[BC_MERGE_RDX6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ACCUM_PLUS]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[ACCUM_AND:%.*]] = and i8 [[ACCUM_PHI]], 1
-; CHECK-NEXT:    [[ACCUM_PLUS]] = add nuw nsw i8 [[ACCUM_AND]], 3
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i32 [[IV]], 191
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
-;
+; CHECK:   vector.body:
+; CHECK:   zext i1 {{.*}} to i8
 
 entry:
   br label %loop
@@ -172,22 +78,10 @@ loop:                                            ; preds = %loop, %entry
 ; wrong type.
 ; TODO: It can also be vectorized with type i32 (or maybe i4?)
 define i4 @test_04(i8 %c) #0 {
+
 ; CHECK-LABEL: @test_04(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[LCSSA:%.*]] = phi i8 [ [[ACCUM_PLUS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i8 [[LCSSA]] to i4
-; CHECK-NEXT:    ret i4 [[TRUNC]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[ACCUM_PHI:%.*]] = phi i8 [ [[C:%.*]], [[ENTRY:%.*]] ], [ [[ACCUM_PLUS]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[ACCUM_AND:%.*]] = and i8 [[ACCUM_PHI]], 1
-; CHECK-NEXT:    [[ACCUM_PLUS]] = add nuw nsw i8 [[ACCUM_AND]], 3
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i32 [[IV]], 191
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]]
-;
+; CHECK-NOT:   vector.body:
+; CHECK-NOT:   zext i1 {{.*}} to i8
 
 entry:
   br label %loop

diff --git a/llvm/test/Transforms/LoopVectorize/pr35773.ll b/llvm/test/Transforms/LoopVectorize/pr35773.ll
index 96f805f703eb9..a4a72594cb38b 100644
--- a/llvm/test/Transforms/LoopVectorize/pr35773.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr35773.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -6,59 +5,27 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define void @doit1(i32* %ptr) {
 ; CHECK-LABEL: @doit1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 9, i32 18, i32 27>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i8> [ <i8 0, i8 9, i8 18, i8 27>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1]] = add <4 x i8> [[VEC_PHI]], [[VEC_IND2]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 36, i32 36, i32 36, i32 36>
-; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i8> [[VEC_IND2]], <i8 36, i8 36, i8 36, i8 36>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP1]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 16, 16
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 144, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[MAIN_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[I8_IV:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[I8_ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[I32_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[I32_ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TRUNC_TO_BE_CONVERTED_TO_NEW_IV:%.*]] = trunc i32 [[I32_IV]] to i8
-; CHECK-NEXT:    [[I8_ADD]] = add i8 [[I8_IV]], [[TRUNC_TO_BE_CONVERTED_TO_NEW_IV]]
-; CHECK-NEXT:    [[PTR_GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[MAIN_IV]]
-; CHECK-NEXT:    store i32 [[I32_IV]], i32* [[PTR_GEP]], align 4
-; CHECK-NEXT:    [[NOOP_CONV_UNDER_PSE:%.*]] = and i32 [[I32_IV]], 255
-; CHECK-NEXT:    [[I32_ADD]] = add nuw nsw i32 [[NOOP_CONV_UNDER_PSE]], 9
-; CHECK-NEXT:    [[INC]] = add i32 [[MAIN_IV]], 1
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[INC]], 16
-; CHECK-NEXT:    br i1 [[TOBOOL]], label [[FOR_COND_FOR_END_CRIT_EDGE]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.cond.for.end_crit_edge:
-; CHECK-NEXT:    [[I8_ADD_LCSSA:%.*]] = phi i8 [ [[I8_ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    store i8 [[I8_ADD_LCSSA]], i8* @b, align 1
-; CHECK-NEXT:    br label [[FOR_END:%.*]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NEXT:    [[MAIN_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[MAIN_IV_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
+; CHECK-NEXT:    [[I8_IV:%.*]] = phi <4 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[I8_IV_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I32_IV:%.*]] = phi <4 x i32> [ <i32 0, i32 9, i32 18, i32 27>, [[VECTOR_PH]] ], [ [[I32_IV_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV_FROM_TRUNC:%.*]] = phi <4 x i8> [ <i8 0, i8 9, i8 18, i8 27>, [[VECTOR_PH]] ], [ [[IV_FROM_TRUNC_NEXT:%.*]], [[VECTOR_BODY]] ]
 
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[MAIN_IV]], 0
 
+; CHECK-NEXT:    [[I8_IV_NEXT]] = add <4 x i8> [[I8_IV]], [[IV_FROM_TRUNC]]
 
+; CHECK-NEXT:    [[GEP1:%.+]] = getelementptr inbounds i32, i32* %ptr, i32 [[TMP7]]
+; CHECK-NEXT:    [[GEP2:%.+]] = getelementptr inbounds i32, i32* [[GEP1]], i32 0
+; CHECK-NEXT:    [[GEP_BC:%.+]] = bitcast i32* [[GEP2]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[I32_IV]], <4 x i32>* [[GEP_BC]], align 4
 
+; CHECK-NEXT:    [[MAIN_IV_NEXT]] = add nuw i32 [[MAIN_IV]], 4
+; CHECK-NEXT:    [[I32_IV_NEXT]] = add <4 x i32> [[I32_IV]], <i32 36, i32 36, i32 36, i32 36>
+; CHECK-NEXT:    [[IV_FROM_TRUNC_NEXT]] = add <4 x i8> [[IV_FROM_TRUNC]], <i8 36, i8 36, i8 36, i8 36>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[MAIN_IV_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+;
 entry:
   br label %for.body
 

diff --git a/llvm/test/Transforms/LoopVectorize/pr37515.ll b/llvm/test/Transforms/LoopVectorize/pr37515.ll
index dd82dd6f3e3cb..b09e11fe15e0e 100644
--- a/llvm/test/Transforms/LoopVectorize/pr37515.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr37515.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes='loop-vectorize' -S -pass-remarks-missed=loop-vectorize < %s 2>&1 | FileCheck %s
 ;
 ; FP primary induction is not supported in LV. Make sure Legal bails out.
@@ -6,18 +5,6 @@
 ; CHECK: loop not vectorized
 
 define void @PR37515() {
-; CHECK-LABEL: @PR37515(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[P:%.*]] = phi float [ 1.900000e+01, [[ENTRY:%.*]] ], [ [[A:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[A]] = fadd fast float [[P]], -1.000000e+00
-; CHECK-NEXT:    [[M:%.*]] = fmul fast float [[A]], [[A]]
-; CHECK-NEXT:    [[C:%.*]] = fcmp fast ugt float [[A]], 2.000000e+00
-; CHECK-NEXT:    br i1 [[C]], label [[LOOP]], label [[EXIT:%.*]]
-; CHECK:       exit:
-; CHECK-NEXT:    unreachable
-;
 entry:
   br label %loop
 

diff --git a/llvm/test/Transforms/LoopVectorize/pr38697.ll b/llvm/test/Transforms/LoopVectorize/pr38697.ll
index 1979cb20c5de6..69dcdb08ece69 100644
--- a/llvm/test/Transforms/LoopVectorize/pr38697.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr38697.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=2 -S < %s 2>&1 | FileCheck %s
 ; RUN: opt -indvars -S < %s 2>&1 | FileCheck %s -check-prefix=INDVARCHECK
 
@@ -34,146 +33,15 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @testCountIncrLoop(i8* %ptr, i32 %lim, i32 %count, i32 %val) mustprogress {
 ; CHECK-LABEL: @testCountIncrLoop(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[LIM:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP1]], label [[LOOP1_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; CHECK:       loop1.preheader:
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
-; CHECK-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[COUNT]], 8
-; CHECK-NEXT:    [[TMP0:%.*]] = shl i32 [[COUNT]], 1
-; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 8)
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[SMAX]], [[TMP0]]
-; CHECK-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 1)
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 [[SMAX]], [[UMIN]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    br label [[LOOP1_BODY:%.*]]
+; CHECK-NOT:     udiv
 ; CHECK:       loop1.body:
-; CHECK-NEXT:    [[OUTER_I:%.*]] = phi i32 [ 0, [[LOOP1_PREHEADER]] ], [ [[OUTER_I_1:%.*]], [[LOOP1_INC:%.*]] ]
-; CHECK-NEXT:    [[INX_1:%.*]] = phi i32 [ 0, [[LOOP1_PREHEADER]] ], [ [[INX_2:%.*]], [[LOOP1_INC]] ]
-; CHECK-NEXT:    br i1 [[CMP2]], label [[WHILE_COND_PREHEADER:%.*]], label [[LOOP1_INC]]
 ; CHECK:       while.cond.preheader:
-; CHECK-NEXT:    br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
 ; CHECK:       while.body.preheader:
-; CHECK-NEXT:    [[TMP4:%.*]] = udiv i32 [[TMP3]], [[COUNT]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[UMIN]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 [[TMP0:%.*]], [[COUNT:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP6]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP6]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[N_VEC]], [[COUNT]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[COUNT]], [[TMP7]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[COUNT]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[COUNT]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = mul <2 x i32> <i32 0, i32 1>, [[DOTSPLAT2]]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[COUNT]], 2
-; CHECK-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP9]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[VAL]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <2 x i32> poison, i32 [[VAL]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT7]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], [[DOTSPLAT4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = ashr <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP12:%.*]] = ashr <2 x i32> [[BROADCAST_SPLAT8]], [[STEP_ADD]]
-; CHECK-NEXT:    [[TMP13]] = add <2 x i32> [[TMP11]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP14]] = add <2 x i32> [[TMP12]], [[VEC_PHI6]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], [[DOTSPLAT4]]
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <2 x i32> [[TMP14]], [[TMP13]]
-; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP6]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[COUNT]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[VAL]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ [[ADD3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[RESULT_1:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[VAL]], [[TMP]]
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[SHR]], [[RESULT_1]]
-; CHECK-NEXT:    [[ADD3]] = add nsw i32 [[TMP]], [[COUNT]]
-; CHECK-NEXT:    [[CMP3:%.*]] = icmp slt i32 [[ADD3]], 8
-; CHECK-NEXT:    br i1 [[CMP3]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[WHILE_END]]
-; CHECK:       while.end:
-; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[VAL]], [[WHILE_COND_PREHEADER]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT:    [[CONV:%.*]] = trunc i32 [[RESULT_0_LCSSA]] to i8
-; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[INX_1]], 1
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INX_1]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT:    store i8 [[CONV]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    br label [[LOOP1_INC]]
-; CHECK:       loop1.inc:
-; CHECK-NEXT:    [[INX_2]] = phi i32 [ [[INC]], [[WHILE_END]] ], [ [[INX_1]], [[LOOP1_BODY]] ]
-; CHECK-NEXT:    [[OUTER_I_1]] = add nuw nsw i32 [[OUTER_I]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[OUTER_I_1]], [[LIM]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP1_BODY]]
-; CHECK:       exit.loopexit:
-; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
-; INDVARCHECK-LABEL: @testCountIncrLoop(
-; INDVARCHECK-NEXT:  entry:
-; INDVARCHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[LIM:%.*]], 0
-; INDVARCHECK-NEXT:    br i1 [[CMP1]], label [[LOOP1_PREHEADER:%.*]], label [[EXIT:%.*]]
-; INDVARCHECK:       loop1.preheader:
-; INDVARCHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
-; INDVARCHECK-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[COUNT]], 8
-; INDVARCHECK-NEXT:    br label [[LOOP1_BODY:%.*]]
-; INDVARCHECK:       loop1.body:
-; INDVARCHECK-NEXT:    [[OUTER_I:%.*]] = phi i32 [ 0, [[LOOP1_PREHEADER]] ], [ [[OUTER_I_1:%.*]], [[LOOP1_INC:%.*]] ]
-; INDVARCHECK-NEXT:    [[INX_1:%.*]] = phi i32 [ 0, [[LOOP1_PREHEADER]] ], [ [[INX_2:%.*]], [[LOOP1_INC]] ]
-; INDVARCHECK-NEXT:    br i1 [[CMP2]], label [[WHILE_COND_PREHEADER:%.*]], label [[LOOP1_INC]]
-; INDVARCHECK:       while.cond.preheader:
-; INDVARCHECK-NEXT:    br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
-; INDVARCHECK:       while.body.preheader:
-; INDVARCHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-; INDVARCHECK:       while.body:
-; INDVARCHECK-NEXT:    [[TMP:%.*]] = phi i32 [ [[ADD3:%.*]], [[WHILE_BODY]] ], [ [[COUNT]], [[WHILE_BODY_PREHEADER]] ]
-; INDVARCHECK-NEXT:    [[RESULT_1:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[VAL:%.*]], [[WHILE_BODY_PREHEADER]] ]
-; INDVARCHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[VAL]], [[TMP]]
-; INDVARCHECK-NEXT:    [[ADD]] = add nsw i32 [[SHR]], [[RESULT_1]]
-; INDVARCHECK-NEXT:    [[ADD3]] = add nsw i32 [[TMP]], [[COUNT]]
-; INDVARCHECK-NEXT:    [[CMP3:%.*]] = icmp slt i32 [[ADD3]], 8
-; INDVARCHECK-NEXT:    br i1 [[CMP3]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT:%.*]]
-; INDVARCHECK:       while.end.loopexit:
-; INDVARCHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ]
-; INDVARCHECK-NEXT:    br label [[WHILE_END]]
-; INDVARCHECK:       while.end:
-; INDVARCHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[VAL]], [[WHILE_COND_PREHEADER]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; INDVARCHECK-NEXT:    [[CONV:%.*]] = trunc i32 [[RESULT_0_LCSSA]] to i8
-; INDVARCHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[INX_1]], 1
-; INDVARCHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INX_1]] to i64
-; INDVARCHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[IDXPROM]]
-; INDVARCHECK-NEXT:    store i8 [[CONV]], i8* [[ARRAYIDX]], align 1
-; INDVARCHECK-NEXT:    br label [[LOOP1_INC]]
-; INDVARCHECK:       loop1.inc:
-; INDVARCHECK-NEXT:    [[INX_2]] = phi i32 [ [[INC]], [[WHILE_END]] ], [ [[INX_1]], [[LOOP1_BODY]] ]
-; INDVARCHECK-NEXT:    [[OUTER_I_1]] = add nuw nsw i32 [[OUTER_I]], 1
-; INDVARCHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[OUTER_I_1]], [[LIM]]
-; INDVARCHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP1_BODY]]
-; INDVARCHECK:       exit.loopexit:
-; INDVARCHECK-NEXT:    br label [[EXIT]]
-; INDVARCHECK:       exit:
-; INDVARCHECK-NEXT:    ret void
+; CHECK:         ret void
 ;
 entry:
   %cmp1 = icmp sgt i32 %lim, 0
@@ -247,147 +115,14 @@ exit:                                             ; preds = %loop1.inc, %entry
 ;
 ; Verify that the 'udiv' is hoisted to the preheader, and is not in the loop body.
 define i32 @NonZeroDivHoist(i32* nocapture readonly %ptr, i32 %start1, i32 %start2) {
-; CHECK-LABEL: @NonZeroDivHoist(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = sub i32 10, [[START2:%.*]]
-; CHECK-NEXT:    br label [[FOR_COND:%.*]]
-; CHECK:       for.cond:
-; CHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[START1:%.*]], [[ENTRY:%.*]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END:%.*]] ]
-; CHECK-NEXT:    [[COUNTER1_0:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[INC9:%.*]], [[FOR_END]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[COUNTER1_0]], 100
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; CHECK-NEXT:    [[CMP224:%.*]] = icmp ult i32 [[START2]], 10
-; CHECK-NEXT:    br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; CHECK:       for.body3.lr.ph:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], [[TMP1]]
-; CHECK-NEXT:    [[IND_END2:%.*]] = add i32 [[START2]], [[N_VEC]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP0]], 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[ADD]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], [[TMP3]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule <2 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 0, [[TMP3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-NEXT:    br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.if5:
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i32 1, [[TMP3]]
-; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP20]] to i64
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP23]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
-; CHECK-NEXT:    [[TMP25:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP26:%.*]] = phi <2 x i32> [ [[TMP15]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP27]] = add <2 x i32> [[TMP26]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP28:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP27]], <2 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP28]])
-; CHECK-NEXT:    br i1 true, label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[START2]], [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
-; CHECK:       for.body3:
-; CHECK-NEXT:    [[INDEX_027:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[VAL_126:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[COUNTER2_025:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT:    [[ADD4]] = add i32 [[DIV]], [[INDEX_027]]
-; CHECK-NEXT:    [[IDXPROM5:%.*]] = zext i32 [[ADD4]] to i64
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; CHECK-NEXT:    [[INC]] = add i32 [[COUNTER2_025]], 1
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[INC]], 10
-; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.cond1.for.end_crit_edge:
-; CHECK-NEXT:    [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INC9]] = add i32 [[COUNTER1_0]], 1
-; CHECK-NEXT:    br label [[FOR_COND]]
-; CHECK:       for.end10:
-; CHECK-NEXT:    [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; CHECK-NEXT:    ret i32 [[VAL_0_LCSSA]]
-;
 ; INDVARCHECK-LABEL: @NonZeroDivHoist(
 ; INDVARCHECK-NEXT:  entry:
-; INDVARCHECK-NEXT:    br label [[FOR_COND:%.*]]
-; INDVARCHECK:       for.cond:
-; INDVARCHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ 1, [[ENTRY:%.*]] ]
-; INDVARCHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[START1:%.*]], [[ENTRY]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END]] ]
-; INDVARCHECK-NEXT:    [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV]], 100
-; INDVARCHECK-NEXT:    br i1 [[EXITCOND4]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
-; INDVARCHECK:       for.body:
-; INDVARCHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; INDVARCHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; INDVARCHECK-NEXT:    [[CMP224:%.*]] = icmp ult i32 [[START2:%.*]], 10
-; INDVARCHECK-NEXT:    br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
 ; INDVARCHECK:       for.body3.lr.ph:
-; INDVARCHECK-NEXT:    [[TMP0:%.*]] = udiv i64 16, [[INDVARS_IV]]
+; INDVARCHECK-NEXT:    [[TMP0:%.*]] = udiv i64 16, [[INDVARS_IV:%.*]]
 ; INDVARCHECK-NEXT:    br label [[FOR_BODY3:%.*]]
 ; INDVARCHECK:       for.body3:
-; INDVARCHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], [[FOR_BODY3]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; INDVARCHECK-NEXT:    [[VAL_126:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT:    [[COUNTER2_025:%.*]] = phi i32 [ [[START2]], [[FOR_BODY3_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT:    [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], [[TMP0]]
-; INDVARCHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[INDVARS_IV_NEXT2]]
-; INDVARCHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; INDVARCHECK-NEXT:    [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; INDVARCHECK-NEXT:    [[INC]] = add nuw nsw i32 [[COUNTER2_025]], 1
-; INDVARCHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 10
-; INDVARCHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; INDVARCHECK:       for.cond1.for.end_crit_edge:
-; INDVARCHECK-NEXT:    [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT:    br label [[FOR_END]]
-; INDVARCHECK:       for.end:
-; INDVARCHECK-NEXT:    [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; INDVARCHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; INDVARCHECK-NEXT:    br label [[FOR_COND]]
+; INDVARCHECK-NOT:     udiv
 ; INDVARCHECK:       for.end10:
-; INDVARCHECK-NEXT:    [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; INDVARCHECK-NEXT:    ret i32 [[VAL_0_LCSSA]]
 ;
 entry:
   br label %for.cond
@@ -443,149 +178,12 @@ for.end10:                                        ; preds = %for.cond
 ; Verify that the 'udiv' is not hoisted to the preheader, and it remains in the
 ; loop body.
 define i32 @ZeroDivNoHoist(i32* nocapture readonly %ptr, i32 %start1, i32 %start2) {
-; CHECK-LABEL: @ZeroDivNoHoist(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = sub i32 10, [[START2:%.*]]
-; CHECK-NEXT:    br label [[FOR_COND:%.*]]
-; CHECK:       for.cond:
-; CHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[START1:%.*]], [[ENTRY:%.*]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END:%.*]] ]
-; CHECK-NEXT:    [[COUNTER1_0:%.*]] = phi i32 [ [[START1]], [[ENTRY]] ], [ [[INC9:%.*]], [[FOR_END]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[COUNTER1_0]], 100
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; CHECK-NEXT:    [[CMP224:%.*]] = icmp ult i32 [[START2]], 10
-; CHECK-NEXT:    br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; CHECK:       for.body3.lr.ph:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], [[TMP1]]
-; CHECK-NEXT:    [[IND_END2:%.*]] = add i32 [[START2]], [[N_VEC]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP0]], 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[ADD]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], [[TMP3]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule <2 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 0, [[TMP3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-NEXT:    br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.if5:
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i32 1, [[TMP3]]
-; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP20]] to i64
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP23]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
-; CHECK-NEXT:    [[TMP25:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP26:%.*]] = phi <2 x i32> [ [[TMP15]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP27]] = add <2 x i32> [[TMP26]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP28:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP27]], <2 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP28]])
-; CHECK-NEXT:    br i1 true, label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[START2]], [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
-; CHECK:       for.body3:
-; CHECK-NEXT:    [[INDEX_027:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[VAL_126:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[COUNTER2_025:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT:    [[ADD4]] = add i32 [[DIV]], [[INDEX_027]]
-; CHECK-NEXT:    [[IDXPROM5:%.*]] = zext i32 [[ADD4]] to i64
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; CHECK-NEXT:    [[INC]] = add i32 [[COUNTER2_025]], 1
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[INC]], 10
-; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       for.cond1.for.end_crit_edge:
-; CHECK-NEXT:    [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INC9]] = add i32 [[COUNTER1_0]], 1
-; CHECK-NEXT:    br label [[FOR_COND]]
-; CHECK:       for.end10:
-; CHECK-NEXT:    [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; CHECK-NEXT:    ret i32 [[VAL_0_LCSSA]]
-;
 ; INDVARCHECK-LABEL: @ZeroDivNoHoist(
 ; INDVARCHECK-NEXT:  entry:
-; INDVARCHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[START1:%.*]] to i64
-; INDVARCHECK-NEXT:    br label [[FOR_COND:%.*]]
-; INDVARCHECK:       for.cond:
-; INDVARCHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
-; INDVARCHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[START1]], [[ENTRY]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END]] ]
-; INDVARCHECK-NEXT:    [[INDVARS3:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; INDVARCHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[INDVARS3]], 100
-; INDVARCHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
-; INDVARCHECK:       for.body:
-; INDVARCHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; INDVARCHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; INDVARCHECK-NEXT:    [[CMP224:%.*]] = icmp ult i32 [[START2:%.*]], 10
-; INDVARCHECK-NEXT:    br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; INDVARCHECK:       for.body3.lr.ph:
-; INDVARCHECK-NEXT:    br label [[FOR_BODY3:%.*]]
+; INDVARCHECK-NOT:     udiv
 ; INDVARCHECK:       for.body3:
-; INDVARCHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], [[FOR_BODY3]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; INDVARCHECK-NEXT:    [[VAL_126:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT:    [[COUNTER2_025:%.*]] = phi i32 [ [[START2]], [[FOR_BODY3_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT:    [[TMP1:%.*]] = udiv i64 16, [[INDVARS_IV]]
-; INDVARCHECK-NEXT:    [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], [[TMP1]]
-; INDVARCHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[INDVARS_IV_NEXT2]]
-; INDVARCHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; INDVARCHECK-NEXT:    [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; INDVARCHECK-NEXT:    [[INC]] = add nuw nsw i32 [[COUNTER2_025]], 1
-; INDVARCHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 10
-; INDVARCHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
+; INDVARCHECK:         [[TMP1:%.*]] = udiv i64 16, [[INDVARS_IV:%.*]]
 ; INDVARCHECK:       for.cond1.for.end_crit_edge:
-; INDVARCHECK-NEXT:    [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT:    br label [[FOR_END]]
-; INDVARCHECK:       for.end:
-; INDVARCHECK-NEXT:    [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; INDVARCHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; INDVARCHECK-NEXT:    br label [[FOR_COND]]
-; INDVARCHECK:       for.end10:
-; INDVARCHECK-NEXT:    [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; INDVARCHECK-NEXT:    ret i32 [[VAL_0_LCSSA]]
 ;
 entry:
   br label %for.cond
@@ -640,147 +238,14 @@ for.end10:                                        ; preds = %for.cond
 ; Verify that the division-operation is hoisted, and that it appears as a
 ; right-shift ('lshr') rather than an explicit division.
 define i32 @DivBy16Hoist(i32* nocapture readonly %ptr, i32 %start1, i32 %start2) {
-; CHECK-LABEL: @DivBy16Hoist(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = sub i32 10, [[START2:%.*]]
-; CHECK-NEXT:    br label [[FOR_COND:%.*]]
-; CHECK:       for.cond:
-; CHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[START1:%.*]], [[ENTRY:%.*]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END:%.*]] ]
-; CHECK-NEXT:    [[COUNTER1_0:%.*]] = phi i32 [ [[START1]], [[ENTRY]] ], [ [[INC9:%.*]], [[FOR_END]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[COUNTER1_0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[COUNTER1_0]], 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[COUNTER1_0]], 100
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; CHECK-NEXT:    [[CMP224:%.*]] = icmp ult i32 [[START2]], 10
-; CHECK-NEXT:    br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; CHECK:       for.body3.lr.ph:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], [[TMP2]]
-; CHECK-NEXT:    [[IND_END2:%.*]] = add i32 [[START2]], [[N_VEC]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP0]], 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[ADD]], i32 0
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], [[TMP1]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule <2 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 0, [[TMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = udiv i32 [[COUNTER1_0]], 16
-; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.if5:
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 1, [[TMP1]]
-; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = udiv i32 [[COUNTER1_0]], 16
-; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], [[TMP17]]
-; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP22]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
-; CHECK-NEXT:    [[TMP24:%.*]] = phi <2 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP25]] = add <2 x i32> [[TMP24]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP26:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP25]], <2 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP26]])
-; CHECK-NEXT:    br i1 true, label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[START2]], [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
-; CHECK:       for.body3:
-; CHECK-NEXT:    [[INDEX_027:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[VAL_126:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[COUNTER2_025:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i32 [[COUNTER1_0]], 16
-; CHECK-NEXT:    [[ADD4]] = add i32 [[DIV]], [[INDEX_027]]
-; CHECK-NEXT:    [[IDXPROM5:%.*]] = zext i32 [[ADD4]] to i64
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; CHECK-NEXT:    [[INC]] = add i32 [[COUNTER2_025]], 1
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[INC]], 10
-; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       for.cond1.for.end_crit_edge:
-; CHECK-NEXT:    [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INC9]] = add i32 [[COUNTER1_0]], 1
-; CHECK-NEXT:    br label [[FOR_COND]]
-; CHECK:       for.end10:
-; CHECK-NEXT:    [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; CHECK-NEXT:    ret i32 [[VAL_0_LCSSA]]
-;
 ; INDVARCHECK-LABEL: @DivBy16Hoist(
 ; INDVARCHECK-NEXT:  entry:
-; INDVARCHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[START1:%.*]] to i64
-; INDVARCHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; INDVARCHECK:       for.cond:
-; INDVARCHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
-; INDVARCHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[START1]], [[ENTRY]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END]] ]
-; INDVARCHECK-NEXT:    [[INDVARS3:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; INDVARCHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[INDVARS_IV]], 4
-; INDVARCHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[INDVARS3]], 100
-; INDVARCHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
+; INDVARCHECK:         [[TMP1:%.*]] = lshr i64 [[INDVARS_IV:%.*]], 4
 ; INDVARCHECK:       for.body:
-; INDVARCHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; INDVARCHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; INDVARCHECK-NEXT:    [[CMP224:%.*]] = icmp ult i32 [[START2:%.*]], 10
-; INDVARCHECK-NEXT:    br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; INDVARCHECK:       for.body3.lr.ph:
-; INDVARCHECK-NEXT:    br label [[FOR_BODY3:%.*]]
-; INDVARCHECK:       for.body3:
-; INDVARCHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], [[FOR_BODY3]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; INDVARCHECK-NEXT:    [[VAL_126:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT:    [[COUNTER2_025:%.*]] = phi i32 [ [[START2]], [[FOR_BODY3_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT:    [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], [[TMP1]]
-; INDVARCHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[INDVARS_IV_NEXT2]]
-; INDVARCHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; INDVARCHECK-NEXT:    [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; INDVARCHECK-NEXT:    [[INC]] = add nuw nsw i32 [[COUNTER2_025]], 1
-; INDVARCHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 10
-; INDVARCHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; INDVARCHECK:       for.cond1.for.end_crit_edge:
-; INDVARCHECK-NEXT:    [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT:    br label [[FOR_END]]
-; INDVARCHECK:       for.end:
-; INDVARCHECK-NEXT:    [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; INDVARCHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; INDVARCHECK-NEXT:    br label [[FOR_COND]]
+; INDVARCHECK-NOT:     lshr
+; INDVARCHECK-NOT:     udiv
 ; INDVARCHECK:       for.end10:
-; INDVARCHECK-NEXT:    [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; INDVARCHECK-NEXT:    ret i32 [[VAL_0_LCSSA]]
 ;
 entry:
   br label %for.cond
@@ -834,147 +299,13 @@ for.end10:                                        ; preds = %for.cond
 ;
 ; Verify that the division-operation is hoisted.
 define i32 @DivBy17Hoist(i32* nocapture readonly %ptr, i32 %start1, i32 %start2) {
-; CHECK-LABEL: @DivBy17Hoist(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = sub i32 10, [[START2:%.*]]
-; CHECK-NEXT:    br label [[FOR_COND:%.*]]
-; CHECK:       for.cond:
-; CHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[START1:%.*]], [[ENTRY:%.*]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END:%.*]] ]
-; CHECK-NEXT:    [[COUNTER1_0:%.*]] = phi i32 [ [[START1]], [[ENTRY]] ], [ [[INC9:%.*]], [[FOR_END]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 [[COUNTER1_0]], 17
-; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[COUNTER1_0]], 17
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[COUNTER1_0]], 100
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; CHECK-NEXT:    [[CMP224:%.*]] = icmp ult i32 [[START2]], 10
-; CHECK-NEXT:    br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; CHECK:       for.body3.lr.ph:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], [[TMP2]]
-; CHECK-NEXT:    [[IND_END2:%.*]] = add i32 [[START2]], [[N_VEC]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP0]], 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[ADD]], i32 0
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], [[TMP1]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule <2 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 0, [[TMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = udiv i32 [[COUNTER1_0]], 17
-; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.if5:
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 1, [[TMP1]]
-; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = udiv i32 [[COUNTER1_0]], 17
-; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], [[TMP17]]
-; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP22]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
-; CHECK-NEXT:    [[TMP24:%.*]] = phi <2 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP25]] = add <2 x i32> [[TMP24]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP26:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP25]], <2 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP26]])
-; CHECK-NEXT:    br i1 true, label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[START2]], [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
-; CHECK:       for.body3:
-; CHECK-NEXT:    [[INDEX_027:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[VAL_126:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[COUNTER2_025:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i32 [[COUNTER1_0]], 17
-; CHECK-NEXT:    [[ADD4]] = add i32 [[DIV]], [[INDEX_027]]
-; CHECK-NEXT:    [[IDXPROM5:%.*]] = zext i32 [[ADD4]] to i64
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; CHECK-NEXT:    [[INC]] = add i32 [[COUNTER2_025]], 1
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[INC]], 10
-; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       for.cond1.for.end_crit_edge:
-; CHECK-NEXT:    [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INC9]] = add i32 [[COUNTER1_0]], 1
-; CHECK-NEXT:    br label [[FOR_COND]]
-; CHECK:       for.end10:
-; CHECK-NEXT:    [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; CHECK-NEXT:    ret i32 [[VAL_0_LCSSA]]
-;
 ; INDVARCHECK-LABEL: @DivBy17Hoist(
 ; INDVARCHECK-NEXT:  entry:
-; INDVARCHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[START1:%.*]] to i64
-; INDVARCHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; INDVARCHECK:       for.cond:
-; INDVARCHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
-; INDVARCHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[START1]], [[ENTRY]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END]] ]
-; INDVARCHECK-NEXT:    [[INDVARS3:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; INDVARCHECK-NEXT:    [[TMP1:%.*]] = udiv i64 [[INDVARS_IV]], 17
-; INDVARCHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[INDVARS3]], 100
-; INDVARCHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
+; INDVARCHECK:         [[TMP1:%.*]] = udiv i64 [[INDVARS_IV:%.*]], 17
 ; INDVARCHECK:       for.body:
-; INDVARCHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; INDVARCHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; INDVARCHECK-NEXT:    [[CMP224:%.*]] = icmp ult i32 [[START2:%.*]], 10
-; INDVARCHECK-NEXT:    br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; INDVARCHECK:       for.body3.lr.ph:
-; INDVARCHECK-NEXT:    br label [[FOR_BODY3:%.*]]
-; INDVARCHECK:       for.body3:
-; INDVARCHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], [[FOR_BODY3]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; INDVARCHECK-NEXT:    [[VAL_126:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT:    [[COUNTER2_025:%.*]] = phi i32 [ [[START2]], [[FOR_BODY3_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT:    [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], [[TMP1]]
-; INDVARCHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[INDVARS_IV_NEXT2]]
-; INDVARCHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; INDVARCHECK-NEXT:    [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; INDVARCHECK-NEXT:    [[INC]] = add nuw nsw i32 [[COUNTER2_025]], 1
-; INDVARCHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 10
-; INDVARCHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; INDVARCHECK:       for.cond1.for.end_crit_edge:
-; INDVARCHECK-NEXT:    [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT:    br label [[FOR_END]]
-; INDVARCHECK:       for.end:
-; INDVARCHECK-NEXT:    [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; INDVARCHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; INDVARCHECK-NEXT:    br label [[FOR_COND]]
+; INDVARCHECK-NOT:     udiv
 ; INDVARCHECK:       for.end10:
-; INDVARCHECK-NEXT:    [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; INDVARCHECK-NEXT:    ret i32 [[VAL_0_LCSSA]]
 ;
 entry:
   br label %for.cond

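These @NonZeroDivHoist / @ZeroDivNoHoist / @DivBy16Hoist / @DivBy17Hoist checks all pin down the same legality rule: a loop-invariant udiv may only be hoisted out of the inner loop when its divisor is provably non-zero (udiv by zero is immediate undefined behaviour), and when the divisor is a power-of-two constant the hoisted division additionally folds to a right shift. A minimal IR sketch of the two extremes, with function names invented for illustration rather than taken from the test file:

  ; Illustrative only: constant power-of-two divisor, so the quotient
  ; is just a shift and the division is safe to hoist.
  define i32 @div_by_16(i32 %n) {
    %q = udiv i32 %n, 16        ; folds to: lshr i32 %n, 4
    ret i32 %q
  }

  ; Illustrative only: the divisor may be zero, so the udiv must stay
  ; under the guard and cannot be moved into the preheader.
  define i32 @div_guarded(i32 %d) {
  entry:
    %nonzero = icmp ne i32 %d, 0
    br i1 %nonzero, label %do.div, label %done
  do.div:
    %q = udiv i32 16, %d
    ret i32 %q
  done:
    ret i32 0
  }
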
diff --git a/llvm/test/Transforms/LoopVectorize/pr38800.ll b/llvm/test/Transforms/LoopVectorize/pr38800.ll
index f8a4f70f71668..d3e937b5b7613 100755
--- a/llvm/test/Transforms/LoopVectorize/pr38800.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr38800.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=2 -pass-remarks-missed='loop-vectorize' -S < %s 2>&1 | FileCheck %s
 
 ; CHECK: remark: <unknown>:0:0: loop not vectorized: integer loop induction variable could not be identified
@@ -17,22 +16,6 @@
 ;}
 
 define void @foo(float* nocapture %ptr, float %val) local_unnamed_addr {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[PTR_PROMOTED:%.*]] = load float, float* [[PTR:%.*]], align 4
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[ADD5:%.*]] = phi float [ [[PTR_PROMOTED]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[F_04:%.*]] = phi float [ 0x3FB99999A0000000, [[ENTRY]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ADD]] = fadd fast float [[ADD5]], [[VAL:%.*]]
-; CHECK-NEXT:    [[ADD1]] = fadd fast float [[F_04]], 0x3F847AE140000000
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp fast olt float [[ADD1]], 1.000000e+00
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    store float [[ADD_LCSSA]], float* [[PTR]], align 4
-; CHECK-NEXT:    ret void
-;
 entry:
   %ptr.promoted = load float, float* %ptr, align 4
   br label %for.body

diff --git a/llvm/test/Transforms/LoopVectorize/pr39099.ll b/llvm/test/Transforms/LoopVectorize/pr39099.ll
index 657fd9b43ca9b..0ba0be99c0b09 100644
--- a/llvm/test/Transforms/LoopVectorize/pr39099.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr39099.ll
@@ -1,11 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 
 ; Ensure that we don't create interleave groups for predicated
-; strided accesses.
+; strided accesses. 
 
 ; CHECK: LV: Checking a loop in 'masked_strided'
 ; CHECK: LV: Analyzing interleaved accesses...

diff --git a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
index ac8afcc64bc8a..1b87ca914b93c 100644
--- a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
@@ -15,36 +15,36 @@ define i16 @test_true_and_false_branch_equal() {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SREM_CONTINUE2:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* @v_38, align 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i16> [[BROADCAST_SPLAT]], zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_SREM_IF:%.*]], label [[PRED_SREM_CONTINUE:%.*]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SREM_CONTINUE4:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, i16* @v_38, align 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT1]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <2 x i16> [[BROADCAST_SPLAT2]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_SREM_IF:%.*]], label [[PRED_SREM_CONTINUE:%.*]]
 ; CHECK:       pred.srem.if:
-; CHECK-NEXT:    [[TMP4:%.*]] = srem i16 5786, [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i16> poison, i16 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = srem i16 5786, [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i16> poison, i16 [[TMP7]], i32 0
 ; CHECK-NEXT:    br label [[PRED_SREM_CONTINUE]]
 ; CHECK:       pred.srem.continue:
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_SREM_IF]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_SREM_IF1:%.*]], label [[PRED_SREM_CONTINUE2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_SREM_IF]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_SREM_IF3:%.*]], label [[PRED_SREM_CONTINUE4]]
 ; CHECK:       pred.srem.if1:
-; CHECK-NEXT:    [[TMP8:%.*]] = srem i16 5786, [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i16> [[TMP6]], i16 [[TMP8]], i32 1
-; CHECK-NEXT:    br label [[PRED_SREM_CONTINUE2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = srem i16 5786, [[TMP2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i16> [[TMP9]], i16 [[TMP11]], i32 1
+; CHECK-NEXT:    br label [[PRED_SREM_CONTINUE4]]
 ; CHECK:       pred.srem.continue2:
-; CHECK-NEXT:    [[TMP10:%.*]] = phi <2 x i16> [ [[TMP6]], [[PRED_SREM_CONTINUE]] ], [ [[TMP9]], [[PRED_SREM_IF1]] ]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> <i16 5786, i16 5786>, <2 x i16> [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
-; CHECK-NEXT:    store i16 [[TMP11]], i16* @v_39, align 1
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
-; CHECK-NEXT:    store i16 [[TMP12]], i16* @v_39, align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x i16> [ [[TMP9]], [[PRED_SREM_CONTINUE]] ], [ [[TMP12]], [[PRED_SREM_IF3]] ]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i16> <i16 5786, i16 5786>, <2 x i16> [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
+; CHECK-NEXT:    store i16 [[TMP14]], i16* @v_39, align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
+; CHECK-NEXT:    store i16 [[TMP15]], i16* @v_39, align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 12, 12
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -67,7 +67,7 @@ define i16 @test_true_and_false_branch_equal() {
 ; CHECK-NEXT:    store i16 [[COND6]], i16* @v_39, align 1
 ; CHECK-NEXT:    [[INC7]] = add nsw i16 [[I_07]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i16 [[INC7]], 111
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[RV:%.*]] = load i16, i16* @v_39, align 1
 ; CHECK-NEXT:    ret i16 [[RV]]

diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll
index 5b7e4c507db6b..04cae22d233dc 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45259.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll
@@ -25,10 +25,10 @@ define i8 @widget(i8* %arr, i8 %t9) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], [[ARR2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP4]] to i8
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i8 1, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp slt i8 [[TMP6]], 1
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[TMP4]], 255
-; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp slt i8 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP4]], 255
+; CHECK-NEXT:    [[TMP12:%.*]] = or i1 [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
@@ -39,18 +39,18 @@ define i8 @widget(i8* %arr, i8 %t9) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i8> [[VEC_IND]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i8> [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[ARR]], i8 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp slt <4 x i8> [[TMP10]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP14:%.*]] = zext <4 x i1> [[TMP13]] to <4 x i8>
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP14]], <4 x i8>* [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i8> [[VEC_IND]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i8> [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[ARR]], i8 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp slt <4 x i8> [[TMP14]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP18:%.*]] = zext <4 x i1> [[TMP17]] to <4 x i8>
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8* [[TMP19]] to <4 x i8>*
+; CHECK-NEXT:    store <4 x i8> [[TMP18]], <4 x i8>* [[TMP20]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], <i8 4, i8 4, i8 4, i8 4>
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]

diff --git a/llvm/test/Transforms/LoopVectorize/pr45525.ll b/llvm/test/Transforms/LoopVectorize/pr45525.ll
index 921a29ba70449..9090599ecf48f 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45525.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45525.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
 
 ; Test case for PR45525. Checks that phi's with a single predecessor and a mask are supported.
@@ -8,46 +7,10 @@ define void @main(i1 %cond, i32* %arr) {
 ; CHECK-NEXT:  bb.0:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[COND:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:         br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[PREDPHI]], <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 32, 32
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[BB_4:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[BB_0:%.*]] ]
-; CHECK-NEXT:    br label [[BB_1:%.*]]
-; CHECK:       bb.1:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[BB_3:%.*]] ]
-; CHECK-NEXT:    br i1 [[COND]], label [[BB_3]], label [[BB_2:%.*]]
-; CHECK:       bb.2:
-; CHECK-NEXT:    [[SINGLE_PRED:%.*]] = phi i32 [ [[IV]], [[BB_1]] ]
-; CHECK-NEXT:    [[MULT:%.*]] = mul i32 [[SINGLE_PRED]], 3
-; CHECK-NEXT:    br label [[BB_3]]
-; CHECK:       bb.3:
-; CHECK-NEXT:    [[STORED_VALUE:%.*]] = phi i32 [ 7, [[BB_1]] ], [ [[MULT]], [[BB_2]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i32 [[IV]]
-; CHECK-NEXT:    store i32 [[STORED_VALUE]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT:    [[CONTINUE:%.*]] = icmp ult i32 [[IV_NEXT]], 32
-; CHECK-NEXT:    br i1 [[CONTINUE]], label [[BB_1]], label [[BB_4]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       bb.4:
-; CHECK-NEXT:    ret void
+; CHECK:         [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK:         [[TMP5:%.*]] = mul <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
 ;
 bb.0:
   br label %bb.1

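With the original checks restored, the surviving @main checks only pin the widened induction variable and the multiply; the point of the PR45525 pattern is that the value reaching the store through the single-predecessor phi in bb.2 is vectorized as a plain select on the negated, broadcast loop-invariant condition. A minimal sketch of that blend, with function and value names invented rather than reusing the test's FileCheck variables:

  ; Illustrative only: the select that stands in for the masked
  ; single-predecessor phi once the loop is widened to VF=4.
  define <4 x i32> @blend(<4 x i1> %mask, <4 x i32> %vec.iv) {
    %not.mask = xor <4 x i1> %mask, <i1 true, i1 true, i1 true, i1 true>
    %mult = mul <4 x i32> %vec.iv, <i32 3, i32 3, i32 3, i32 3>
    %predphi = select <4 x i1> %not.mask, <4 x i32> %mult, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
    ret <4 x i32> %predphi
  }
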
diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
index baecc032c78e2..bdcf6a4109e64 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
@@ -55,7 +55,7 @@ define void @pr45679(i32* %A) optsize {
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -67,7 +67,7 @@ define void @pr45679(i32* %A) optsize {
 ; CHECK-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop !2
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -117,7 +117,7 @@ define void @pr45679(i32* %A) optsize {
 ; VF2UF2-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; VF2UF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
 ; VF2UF2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; VF2UF2-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2UF2-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; VF2UF2:       middle.block:
 ; VF2UF2-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2UF2:       scalar.ph:
@@ -129,7 +129,7 @@ define void @pr45679(i32* %A) optsize {
 ; VF2UF2-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
 ; VF2UF2-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; VF2UF2-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14
-; VF2UF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF2UF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop !2
 ; VF2UF2:       exit:
 ; VF2UF2-NEXT:    ret void
 ;
@@ -139,46 +139,46 @@ define void @pr45679(i32* %A) optsize {
 ; VF1UF4:       vector.ph:
 ; VF1UF4-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VF1UF4:       vector.body:
-; VF1UF4-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
-; VF1UF4-NEXT:    [[VEC_IV:%.*]] = add i32 [[INDEX]], 0
-; VF1UF4-NEXT:    [[VEC_IV4:%.*]] = add i32 [[INDEX]], 1
-; VF1UF4-NEXT:    [[VEC_IV5:%.*]] = add i32 [[INDEX]], 2
-; VF1UF4-NEXT:    [[VEC_IV6:%.*]] = add i32 [[INDEX]], 3
-; VF1UF4-NEXT:    [[TMP0:%.*]] = icmp ule i32 [[VEC_IV]], 13
-; VF1UF4-NEXT:    [[TMP1:%.*]] = icmp ule i32 [[VEC_IV4]], 13
-; VF1UF4-NEXT:    [[TMP2:%.*]] = icmp ule i32 [[VEC_IV5]], 13
-; VF1UF4-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[VEC_IV6]], 13
+; VF1UF4-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
+; VF1UF4-NEXT:    [[INDUCTION:%.*]] = add i32 [[INDEX]], 0
+; VF1UF4-NEXT:    [[INDUCTION1:%.*]] = add i32 [[INDEX]], 1
+; VF1UF4-NEXT:    [[INDUCTION2:%.*]] = add i32 [[INDEX]], 2
+; VF1UF4-NEXT:    [[INDUCTION3:%.*]] = add i32 [[INDEX]], 3
+; VF1UF4-NEXT:    [[TMP0:%.*]] = icmp ule i32 [[INDUCTION]], 13
+; VF1UF4-NEXT:    [[TMP1:%.*]] = icmp ule i32 [[INDUCTION1]], 13
+; VF1UF4-NEXT:    [[TMP2:%.*]] = icmp ule i32 [[INDUCTION2]], 13
+; VF1UF4-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[INDUCTION3]], 13
 ; VF1UF4-NEXT:    br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; VF1UF4:       pred.store.if:
-; VF1UF4-NEXT:    [[INDUCTION:%.*]] = add i32 [[INDEX]], 0
-; VF1UF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDUCTION]]
+; VF1UF4-NEXT:    [[SUNK_IND0:%.*]] = add i32 [[INDEX]], 0
+; VF1UF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[SUNK_IND0]]
 ; VF1UF4-NEXT:    store i32 13, i32* [[TMP4]], align 1
 ; VF1UF4-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; VF1UF4:       pred.store.continue:
-; VF1UF4-NEXT:    br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; VF1UF4-NEXT:    br i1 [[TMP1]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
 ; VF1UF4:       pred.store.if7:
-; VF1UF4-NEXT:    [[INDUCTION1:%.*]] = add i32 [[INDEX]], 1
-; VF1UF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDUCTION1]]
+; VF1UF4-NEXT:    [[SUNK_IND1:%.*]] = add i32 [[INDEX]], 1
+; VF1UF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[SUNK_IND1]]
 ; VF1UF4-NEXT:    store i32 13, i32* [[TMP5]], align 1
-; VF1UF4-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; VF1UF4-NEXT:    br label [[PRED_STORE_CONTINUE5]]
 ; VF1UF4:       pred.store.continue8:
-; VF1UF4-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; VF1UF4-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
 ; VF1UF4:       pred.store.if9:
-; VF1UF4-NEXT:    [[INDUCTION2:%.*]] = add i32 [[INDEX]], 2
-; VF1UF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDUCTION2]]
+; VF1UF4-NEXT:    [[SUNK_IND2:%.*]] = add i32 [[INDEX]], 2
+; VF1UF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[SUNK_IND2]]
 ; VF1UF4-NEXT:    store i32 13, i32* [[TMP6]], align 1
-; VF1UF4-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; VF1UF4-NEXT:    br label [[PRED_STORE_CONTINUE7]]
 ; VF1UF4:       pred.store.continue10:
-; VF1UF4-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
+; VF1UF4-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
 ; VF1UF4:       pred.store.if11:
-; VF1UF4-NEXT:    [[INDUCTION3:%.*]] = add i32 [[INDEX]], 3
-; VF1UF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDUCTION3]]
+; VF1UF4-NEXT:    [[SUNK_IND3:%.*]] = add i32 [[INDEX]], 3
+; VF1UF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[SUNK_IND3]]
 ; VF1UF4-NEXT:    store i32 13, i32* [[TMP7]], align 1
-; VF1UF4-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; VF1UF4-NEXT:    br label [[PRED_STORE_CONTINUE9]]
 ; VF1UF4:       pred.store.continue12:
 ; VF1UF4-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; VF1UF4-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; VF1UF4-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF1UF4-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
 ; VF1UF4:       middle.block:
 ; VF1UF4-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF1UF4:       scalar.ph:
@@ -190,7 +190,7 @@ define void @pr45679(i32* %A) optsize {
 ; VF1UF4-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
 ; VF1UF4-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; VF1UF4-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14
-; VF1UF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF1UF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]]
 ; VF1UF4:       exit:
 ; VF1UF4-NEXT:    ret void
 ;

diff --git a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
index e583715fd44f1..9d39e817bfa73 100644
--- a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
@@ -29,32 +29,32 @@ define void @test(i16 %x, i64 %y, i32* %ptr) {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[INC]]
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[OFFSET_IDX]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INC]] to i8
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i8 0, [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i8 [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX3:%.*]] = mul i64 [[INDEX]], [[INC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[OFFSET_IDX3]] to i8
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc i64 [[INC]] to i8
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i8 0, [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i8 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT9]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT10]], <i64 0, i64 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
+; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
 ; CHECK-NEXT:    store i32 0, i32* [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
 ; CHECK:       pred.store.if3:
 ; CHECK-NEXT:    store i32 0, i32* [[PTR]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
 ; CHECK:       pred.store.continue4:
-; CHECK-NEXT:    [[TMP10:%.*]] = add i8 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i8 [[TMP10]], 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:

diff --git a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll
index 6abc352b98493..db69de6d36610 100644
--- a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll
@@ -51,8 +51,8 @@ define void @f() {
 ; CHECK-NEXT:    store i8 10, i8* [[TMP0]], align 1
 ; CHECK-NEXT:    store i8 10, i8* [[TMP0]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 500
-; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 500
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 500, 500
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]

diff --git a/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll
index 87b5130ca6bec..8ace7dea234e7 100644
--- a/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll
@@ -15,13 +15,13 @@ define dso_local i16 @reverse_interleave_load_fold_mask() optsize {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE2]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE4]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i16 41, [[TMP0]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <2 x i32> [[VEC_IV]], <i32 40, i32 40>
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -39,7 +39,7 @@ define dso_local i16 @reverse_interleave_load_fold_mask() optsize {
 ; CHECK-NEXT:    [[TMP11:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP12:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
 ; CHECK:       pred.load.if1:
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i16 [[OFFSET_IDX]], -1
 ; CHECK-NEXT:    [[TMP15:%.*]] = add nsw i16 [[TMP14]], -1
@@ -49,10 +49,10 @@ define dso_local i16 @reverse_interleave_load_fold_mask() optsize {
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [40 x [4 x i16]], [40 x [4 x i16]]* @A, i16 0, i16 [[TMP15]], i16 3
 ; CHECK-NEXT:    [[TMP20:%.*]] = load i16, i16* [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <2 x i16> [[TMP12]], i16 [[TMP20]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
 ; CHECK:       pred.load.continue2:
-; CHECK-NEXT:    [[TMP22:%.*]] = phi <2 x i16> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP18]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT:    [[TMP23:%.*]] = phi <2 x i16> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = phi <2 x i16> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT:    [[TMP23:%.*]] = phi <2 x i16> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP24:%.*]] = add nsw <2 x i16> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP25]] = add <2 x i16> [[VEC_PHI]], [[TMP24]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[TMP25]], <2 x i16> [[VEC_PHI]]

diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
index fe7e74cd1990e..8be98ebf087d5 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -dce -instcombine -S | FileCheck %s
 
 define float @cond_fadd(float* noalias nocapture readonly %a, float* noalias nocapture readonly %cond, i64 %N){
@@ -446,7 +445,7 @@ define i32 @simple_chained_rdx(i32* noalias %a, i32* noalias %b, i32* noalias %c
 ; CHECK-NEXT:    [[TMP51]] = add i32 [[TMP50]], [[TMP28]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -473,7 +472,7 @@ define i32 @simple_chained_rdx(i32* noalias %a, i32* noalias %b, i32* noalias %c
 ; CHECK-NEXT:    [[RES]] = phi i32 [ [[ADD3]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[RES_LCSSA]]
@@ -517,139 +516,11 @@ for.end:
 ;
 define i64 @nested_cond_and(i64* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, i64* noalias nocapture readonly %cond, i64 %N){
 ; CHECK-LABEL: @nested_cond_and(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 5, i64 -1, i64 -1, i64 -1>, [[VECTOR_PH]] ], [ [[PREDPHI15:%.*]], [[PRED_LOAD_CONTINUE14]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[TMP3]] to <4 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, <4 x i64>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP10:%.*]] = phi <4 x i64> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK:       pred.load.if1:
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP13]], i64 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK:       pred.load.continue2:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i64> [ [[TMP10]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2
-; CHECK-NEXT:    br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i64, i64* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP18]], i64 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi <4 x i64> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3
-; CHECK-NEXT:    br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP23:%.*]] = load i64, i64* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP23]], i64 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
-; CHECK-NEXT:    [[TMP25:%.*]] = phi <4 x i64> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq <4 x i64> [[TMP25]], <i64 3, i64 3, i64 3, i64 3>
-; CHECK-NEXT:    [[TMP27:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> [[TMP26]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP27]], i64 0
-; CHECK-NEXT:    br i1 [[TMP28]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
-; CHECK:       pred.load.if7:
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP30:%.*]] = load i64, i64* [[TMP29]], align 4
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i64> poison, i64 [[TMP30]], i64 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.continue8:
-; CHECK-NEXT:    [[TMP32:%.*]] = phi <4 x i64> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP31]], [[PRED_LOAD_IF7]] ]
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP27]], i64 1
-; CHECK-NEXT:    br i1 [[TMP33]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
-; CHECK:       pred.load.if9:
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load i64, i64* [[TMP34]], align 4
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i64> [[TMP32]], i64 [[TMP35]], i64 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
-; CHECK:       pred.load.continue10:
-; CHECK-NEXT:    [[TMP37:%.*]] = phi <4 x i64> [ [[TMP32]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP36]], [[PRED_LOAD_IF9]] ]
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <4 x i1> [[TMP27]], i64 2
-; CHECK-NEXT:    br i1 [[TMP38]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
-; CHECK:       pred.load.if11:
-; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP40:%.*]] = load i64, i64* [[TMP39]], align 4
-; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i64> [[TMP37]], i64 [[TMP40]], i64 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
-; CHECK:       pred.load.continue12:
-; CHECK-NEXT:    [[TMP42:%.*]] = phi <4 x i64> [ [[TMP37]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP41]], [[PRED_LOAD_IF11]] ]
-; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i1> [[TMP27]], i64 3
-; CHECK-NEXT:    br i1 [[TMP43]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]]
-; CHECK:       pred.load.if13:
-; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP45:%.*]] = load i64, i64* [[TMP44]], align 4
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i64> [[TMP42]], i64 [[TMP45]], i64 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
-; CHECK:       pred.load.continue14:
-; CHECK-NEXT:    [[TMP47:%.*]] = phi <4 x i64> [ [[TMP42]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP46]], [[PRED_LOAD_IF13]] ]
-; CHECK-NEXT:    [[TMP48:%.*]] = xor <4 x i1> [[TMP26]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP49]], <4 x i64> [[TMP25]], <4 x i64> [[TMP47]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[PREDPHI_V]], <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>
-; CHECK-NEXT:    [[PREDPHI15]] = and <4 x i64> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NOT:     @llvm.vector.reduce.and
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> [[PREDPHI15]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP51]], [[MIDDLE_BLOCK]] ], [ 5, [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT:    [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[COND]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP52:%.*]] = load i64, i64* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i64 [[TMP52]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP53:%.*]] = load i64, i64* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[AND1:%.*]] = and i64 [[RDX]], [[TMP53]]
-; CHECK-NEXT:    [[TOBOOL2:%.*]] = icmp eq i64 [[TMP53]], 3
-; CHECK-NEXT:    br i1 [[TOBOOL2]], label [[IF_THEN_2:%.*]], label [[FOR_INC]]
-; CHECK:       if.then.2:
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP54:%.*]] = load i64, i64* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[AND2:%.*]] = and i64 [[RDX]], [[TMP54]]
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[RES]] = phi i64 [ [[AND2]], [[IF_THEN_2]] ], [ [[AND1]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i64 [ [[RES]], [[FOR_INC]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i64 [[RES_LCSSA]]
-;
+; CHECK:         @llvm.vector.reduce.and
+; CHECK:       scalar.ph
 entry:
   br label %for.body
 
@@ -690,100 +561,11 @@ for.end:
 ;
 define i32 @cond-uncond(i32* noalias %src1, i32* noalias %src2, i32* noalias %cond, i64 noundef %n) #0 {
 ; CHECK-LABEL: @cond-uncond(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
-; CHECK-NEXT:    br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i64 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
-; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK:       pred.load.if1:
-; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP12]], i64 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK:       pred.load.continue2:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
-; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
-; CHECK-NEXT:    [[TMP16:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP18]], i64 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
-; CHECK-NEXT:    br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.if5:
-; CHECK-NEXT:    [[TMP22:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP24]], i64 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
 ; CHECK:       pred.load.continue6:
-; CHECK-NEXT:    [[TMP26:%.*]] = phi <4 x i32> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP25]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP27:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> zeroinitializer, <4 x i32> [[TMP26]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = add <4 x i32> [[VEC_PHI]], [[TMP27]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; CHECK-NEXT:    [[TMP30]] = add <4 x i32> [[WIDE_LOAD7]], [[PREDPHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NOT:     @llvm.vector.reduce.add
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP30]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP32]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[RDX1:%.*]] = phi i32 [ [[ADD2:%.*]], [[IF_END:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[IF_END]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP33]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP34]], [[RDX1]]
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[RDX1]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD2]] = add nsw i32 [[TMP35]], [[RES]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD2]], [[IF_END]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[ADD2_LCSSA]]
-;
+; CHECK:         @llvm.vector.reduce.add
+; CHECK:       scalar.ph
 entry:
   br label %for.body
 
@@ -820,140 +602,11 @@ for.end:
 ;
 define float @cond_cond(float* noalias %src1, float* noalias %src2, float* noalias %cond, i64 %n) #0 {
 ; CHECK-LABEL: @cond_cond(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 2.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[PREDPHI15:%.*]], [[PRED_LOAD_CONTINUE14]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[SRC1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP8]], i64 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP10:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK:       pred.load.if1:
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP13]], i64 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK:       pred.load.continue2:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x float> [ [[TMP10]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2
-; CHECK-NEXT:    br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP18]], i64 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi <4 x float> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3
-; CHECK-NEXT:    br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP23]], i64 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
-; CHECK-NEXT:    [[TMP25:%.*]] = phi <4 x float> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP26:%.*]] = select <4 x i1> [[TMP5]], <4 x float> [[TMP25]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP26]]
-; CHECK-NEXT:    [[TMP27:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP27]], i64 0
-; CHECK-NEXT:    br i1 [[TMP28]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
-; CHECK:       pred.load.if7:
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[SRC2:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP29]], align 4
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> poison, float [[TMP30]], i64 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.continue8:
-; CHECK-NEXT:    [[TMP32:%.*]] = phi <4 x float> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP31]], [[PRED_LOAD_IF7]] ]
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP27]], i64 1
-; CHECK-NEXT:    br i1 [[TMP33]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
-; CHECK:       pred.load.if9:
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load float, float* [[TMP34]], align 4
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP35]], i64 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
-; CHECK:       pred.load.continue10:
-; CHECK-NEXT:    [[TMP37:%.*]] = phi <4 x float> [ [[TMP32]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP36]], [[PRED_LOAD_IF9]] ]
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <4 x i1> [[TMP27]], i64 2
-; CHECK-NEXT:    br i1 [[TMP38]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
-; CHECK:       pred.load.if11:
-; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP40:%.*]] = load float, float* [[TMP39]], align 4
-; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP37]], float [[TMP40]], i64 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
-; CHECK:       pred.load.continue12:
-; CHECK-NEXT:    [[TMP42:%.*]] = phi <4 x float> [ [[TMP37]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP41]], [[PRED_LOAD_IF11]] ]
-; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i1> [[TMP27]], i64 3
-; CHECK-NEXT:    br i1 [[TMP43]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]]
-; CHECK:       pred.load.if13:
-; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP45:%.*]] = load float, float* [[TMP44]], align 4
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP45]], i64 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
 ; CHECK:       pred.load.continue14:
-; CHECK-NEXT:    [[TMP47:%.*]] = phi <4 x float> [ [[TMP42]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP46]], [[PRED_LOAD_IF13]] ]
-; CHECK-NEXT:    [[TMP48:%.*]] = select <4 x i1> [[TMP27]], <4 x float> [[TMP47]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>
-; CHECK-NEXT:    [[PREDPHI15]] = fadd fast <4 x float> [[PREDPHI]], [[TMP48]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NOT:     @llvm.vector.reduce.fadd
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP50:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI15]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP50]], [[MIDDLE_BLOCK]] ], [ 2.000000e+00, [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[RDX1:%.*]] = phi float [ [[RES:%.*]], [[FOR_INC:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP51:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp fast oeq float [[TMP51]], 3.000000e+00
-; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP52:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP52]], [[RDX1]]
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[RDX2:%.*]] = phi float [ [[ADD]], [[IF_THEN]] ], [ [[RDX1]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[CMP5:%.*]] = fcmp fast oeq float [[TMP51]], 7.000000e+00
-; CHECK-NEXT:    br i1 [[CMP5]], label [[IF_THEN6:%.*]], label [[FOR_INC]]
-; CHECK:       if.then6:
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP53:%.*]] = load float, float* [[ARRAYIDX7]], align 4
-; CHECK-NEXT:    [[ADD2:%.*]] = fadd fast float [[TMP53]], [[RDX2]]
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[RES]] = phi float [ [[ADD2]], [[IF_THEN6]] ], [ [[RDX2]], [[IF_END]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP50]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret float [[RES_LCSSA]]
-;
+; CHECK:         @llvm.vector.reduce.fadd
+; CHECK:       scalar.ph
 entry:
   br label %for.body
 
@@ -998,100 +651,11 @@ for.end:
 ;
 define i32 @uncond_cond(i32* noalias %src1, i32* noalias %src2, i32* noalias %cond, i64 %N) #0 {
 ; CHECK-LABEL: @uncond_cond(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE7:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE7]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD1]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i64 0
-; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i64 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP11:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP6]], i64 1
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_LOAD_IF2:%.*]], label [[PRED_LOAD_CONTINUE3:%.*]]
-; CHECK:       pred.load.if2:
-; CHECK-NEXT:    [[TMP13:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP15]], i64 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE3]]
-; CHECK:       pred.load.continue3:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i32> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF2]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP6]], i64 2
-; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]]
-; CHECK:       pred.load.if4:
-; CHECK-NEXT:    [[TMP19:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP21]], i64 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE5]]
-; CHECK:       pred.load.continue5:
-; CHECK-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_LOAD_CONTINUE3]] ], [ [[TMP22]], [[PRED_LOAD_IF4]] ]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP6]], i64 3
-; CHECK-NEXT:    br i1 [[TMP24]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7]]
-; CHECK:       pred.load.if6:
-; CHECK-NEXT:    [[TMP25:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[TMP27]], i64 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE7]]
 ; CHECK:       pred.load.continue7:
-; CHECK-NEXT:    [[TMP29:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP28]], [[PRED_LOAD_IF6]] ]
-; CHECK-NEXT:    [[TMP30:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> zeroinitializer, <4 x i32> [[TMP29]]
-; CHECK-NEXT:    [[PREDPHI]] = add <4 x i32> [[TMP2]], [[TMP30]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NOT:     @llvm.vector.reduce.add
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PREDPHI]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP32]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[RES:%.*]], [[FOR_INC:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP33]], [[RDX]]
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP34]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP35]], [[ADD1]]
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[RES]] = phi i32 [ [[ADD2]], [[IF_THEN]] ], [ [[ADD1]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[RES_LCSSA]]
-;
+; CHECK:         @llvm.vector.reduce.add
+; CHECK:       scalar.ph
 entry:
   br label %for.body
 
@@ -1128,102 +692,11 @@ for.end:
 ;
 define i32 @uncond_cond_uncond(i32* noalias %src1, i32* noalias %src2, i32* noalias %cond, i64 noundef %N) {
 ; CHECK-LABEL: @uncond_cond_uncond(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE7:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[PRED_LOAD_CONTINUE7]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD1]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i64 0
-; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i64 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP11:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP6]], i64 1
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_LOAD_IF2:%.*]], label [[PRED_LOAD_CONTINUE3:%.*]]
-; CHECK:       pred.load.if2:
-; CHECK-NEXT:    [[TMP13:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP15]], i64 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE3]]
-; CHECK:       pred.load.continue3:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i32> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF2]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP6]], i64 2
-; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]]
-; CHECK:       pred.load.if4:
-; CHECK-NEXT:    [[TMP19:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP21]], i64 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE5]]
-; CHECK:       pred.load.continue5:
-; CHECK-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_LOAD_CONTINUE3]] ], [ [[TMP22]], [[PRED_LOAD_IF4]] ]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP6]], i64 3
-; CHECK-NEXT:    br i1 [[TMP24]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7]]
-; CHECK:       pred.load.if6:
-; CHECK-NEXT:    [[TMP25:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[TMP27]], i64 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE7]]
 ; CHECK:       pred.load.continue7:
-; CHECK-NEXT:    [[TMP29:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP28]], [[PRED_LOAD_IF6]] ]
-; CHECK-NEXT:    [[TMP30:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> zeroinitializer, <4 x i32> [[TMP29]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = add <4 x i32> [[TMP2]], [[TMP30]]
-; CHECK-NEXT:    [[TMP31]] = add <4 x i32> [[PREDPHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NOT:     @llvm.vector.reduce.add
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP31]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[ADD3:%.*]], [[IF_END:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[IF_END]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP34]], [[RDX]]
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP35]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP36:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP36]], [[ADD1]]
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[ADD2]], [[IF_THEN]] ], [ [[ADD1]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ADD3]] = add nsw i32 [[RES]], [[TMP34]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD3_LCSSA:%.*]] = phi i32 [ [[ADD3]], [[IF_END]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[ADD3_LCSSA]]
-;
+; CHECK:         @llvm.vector.reduce.add
+; CHECK:       scalar.ph
 entry:
   br label %for.body
 

diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index be27e22bb76e4..cc3d6b10f9f4c 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -58,21 +58,21 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND]])
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD3]])
 ; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
@@ -166,21 +166,21 @@ define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND]])
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD3]])
 ; CHECK-NEXT:    [[TMP9]] = mul i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
@@ -226,20 +226,20 @@ define i32 @reduction_mix(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND]])
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND1]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[TMP8]] = add i32 [[TMP7]], [[TMP6]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
@@ -939,21 +939,21 @@ define i32 @reduction_predicated(i32* noalias nocapture %A, i32* noalias nocaptu
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND]])
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD3]])
 ; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
 ; CHECK:       middle.block:

diff --git a/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll b/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll
index f055ec768a92c..601c9af4f0618 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll
@@ -1,141 +1,88 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt %s -loop-vectorize -force-vector-interleave=3 -force-vector-width=4 -S | FileCheck --check-prefix=UF3 %s
 ; RUN: opt %s -loop-vectorize -force-vector-interleave=5 -force-vector-width=4 -S | FileCheck --check-prefix=UF5 %s
 
 define i32 @reduction_sum(i64 %n, i32* noalias nocapture %A) {
-; UF3-LABEL: @reduction_sum(
-; UF3-NEXT:  entry:
-; UF3-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
-; UF3-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 12
-; UF3-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UF3:       vector.ph:
-; UF3-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 12
-; UF3-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; UF3-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UF3:       vector.body:
-; UF3-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UF3-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; UF3-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; UF3-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; UF3-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; UF3-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 4
-; UF3-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 8
-; UF3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
-; UF3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
-; UF3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
-; UF3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; UF3-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; UF3-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
-; UF3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4
-; UF3-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; UF3-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
-; UF3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; UF3-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; UF3-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
-; UF3-NEXT:    [[TMP13]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; UF3-NEXT:    [[TMP14]] = add <4 x i32> [[VEC_PHI1]], [[WIDE_LOAD3]]
-; UF3-NEXT:    [[TMP15]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD4]]
-; UF3-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 12
-; UF3-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UF3-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; UF3:       middle.block:
-; UF3-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP14]], [[TMP13]]
-; UF3-NEXT:    [[BIN_RDX5:%.*]] = add <4 x i32> [[TMP15]], [[BIN_RDX]]
-; UF3-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX5]])
-; UF3-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; UF3-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; UF3:       scalar.ph:
-; UF3-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UF3-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; UF3-NEXT:    br label [[LOOP:%.*]]
-; UF3:       loop:
-; UF3-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; UF3-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[LOOP]] ]
-; UF3-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; UF3-NEXT:    [[LV_A:%.*]] = load i32, i32* [[GEP_A]], align 4
-; UF3-NEXT:    [[SUM_NEXT]] = add i32 [[SUM_02]], [[LV_A]]
-; UF3-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; UF3-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
-; UF3-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; UF3:       exit:
-; UF3-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], [[LOOP]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; UF3-NEXT:    ret i32 [[SUM_0_LCSSA]]
+; UF3-LABEL: vector.body:
+; UF3-NEXT:   [[IV:%.+]] = phi i64 [ 0, %vector.ph ], [ [[IV_NEXT:%.+]], %vector.body ]
+; UF3-NEXT:   [[SUM0:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM0_NEXT:%.+]], %vector.body ]
+; UF3-NEXT:   [[SUM1:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM1_NEXT:%.+]], %vector.body ]
+; UF3-NEXT:   [[SUM2:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM2_NEXT:%.+]],  %vector.body ]
+; UF3-NEXT:   [[IV0:%.+]] = add i64 [[IV]], 0
+; UF3-NEXT:   [[IV1:%.+]] = add i64 [[IV]], 4
+; UF3-NEXT:   [[IV2:%.+]] = add i64 [[IV]], 8
+; UF3-NEXT:   [[GEP0:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV0]]
+; UF3-NEXT:   [[GEP1:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV1]]
+; UF3-NEXT:   [[GEP2:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV2]]
+; UF3-NEXT:   [[L_GEP0:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 0
+; UF3-NEXT:   [[BC0:%.+]] = bitcast i32* [[L_GEP0]] to <4 x i32>*
+; UF3-NEXT:   [[L0:%.+]] = load <4 x i32>, <4 x i32>* [[BC0]], align 4
+; UF3-NEXT:   [[L_GEP1:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 4
+; UF3-NEXT:   [[BC1:%.+]] = bitcast i32* [[L_GEP1]] to <4 x i32>*
+; UF3-NEXT:   [[L1:%.+]] = load <4 x i32>, <4 x i32>* [[BC1]], align 4
+; UF3-NEXT:   [[L_GEP2:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 8
+; UF3-NEXT:   [[BC2:%.+]] = bitcast i32* [[L_GEP2]] to <4 x i32>*
+; UF3-NEXT:   [[L2:%.+]] = load <4 x i32>, <4 x i32>* [[BC2]], align 4
+; UF3-NEXT:   [[SUM0_NEXT]] = add <4 x i32> [[SUM0]], [[L0]]
+; UF3-NEXT:   [[SUM1_NEXT]] = add <4 x i32> [[SUM1]], [[L1]]
+; UF3-NEXT:   [[SUM2_NEXT]] = add <4 x i32> [[SUM2]], [[L2]]
+; UF3-NEXT:   [[IV_NEXT]] = add nuw i64 [[IV]], 12
+; UF3-NEXT:   [[EC:%.+]] = icmp eq i64 [[IV_NEXT]], %n.vec
+; UF3-NEXT:   br i1 [[EC]], label %middle.block, label %vector.body
 ;
-; UF5-LABEL: @reduction_sum(
-; UF5-NEXT:  entry:
-; UF5-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
-; UF5-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 20
-; UF5-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UF5:       vector.ph:
-; UF5-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 20
-; UF5-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; UF5-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UF5:       vector.body:
-; UF5-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UF5-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; UF5-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
-; UF5-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
-; UF5-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; UF5-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; UF5-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; UF5-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 4
-; UF5-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 8
-; UF5-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 12
-; UF5-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 16
-; UF5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
-; UF5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
-; UF5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
-; UF5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; UF5-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
-; UF5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; UF5-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; UF5-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
-; UF5-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 4
-; UF5-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; UF5-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; UF5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 8
-; UF5-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; UF5-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4
-; UF5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 12
-; UF5-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
-; UF5-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
-; UF5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16
-; UF5-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <4 x i32>*
-; UF5-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP20]], align 4
-; UF5-NEXT:    [[TMP21]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; UF5-NEXT:    [[TMP22]] = add <4 x i32> [[VEC_PHI1]], [[WIDE_LOAD5]]
-; UF5-NEXT:    [[TMP23]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD6]]
-; UF5-NEXT:    [[TMP24]] = add <4 x i32> [[VEC_PHI3]], [[WIDE_LOAD7]]
-; UF5-NEXT:    [[TMP25]] = add <4 x i32> [[VEC_PHI4]], [[WIDE_LOAD8]]
-; UF5-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 20
-; UF5-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UF5-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; UF5:       middle.block:
-; UF5-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP22]], [[TMP21]]
-; UF5-NEXT:    [[BIN_RDX9:%.*]] = add <4 x i32> [[TMP23]], [[BIN_RDX]]
-; UF5-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP24]], [[BIN_RDX9]]
-; UF5-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP25]], [[BIN_RDX10]]
-; UF5-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; UF5-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; UF5-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; UF5:       scalar.ph:
-; UF5-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UF5-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; UF5-NEXT:    br label [[LOOP:%.*]]
-; UF5:       loop:
-; UF5-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; UF5-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[LOOP]] ]
-; UF5-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; UF5-NEXT:    [[LV_A:%.*]] = load i32, i32* [[GEP_A]], align 4
-; UF5-NEXT:    [[SUM_NEXT]] = add i32 [[SUM_02]], [[LV_A]]
-; UF5-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; UF5-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
-; UF5-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; UF5:       exit:
-; UF5-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], [[LOOP]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; UF5-NEXT:    ret i32 [[SUM_0_LCSSA]]
+; UF3-LABEL: middle.block:
+; UF3-NEXT:   [[RDX0:%.+]] = add <4 x i32> [[SUM1_NEXT]], [[SUM0_NEXT]]
+; UF3-NEXT:   [[RDX1:%.+]] = add <4 x i32> [[SUM2_NEXT]], [[RDX0]]
+; UF3-NEXT:   call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[RDX1]])
 ;
 
+; UF5-LABEL: vector.body:
+; UF5-NEXT:   [[IV:%.+]] = phi i64 [ 0, %vector.ph ], [ [[IV_NEXT:%.+]], %vector.body ]
+; UF5-NEXT:   [[SUM0:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM0_NEXT:%.+]], %vector.body ]
+; UF5-NEXT:   [[SUM1:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM1_NEXT:%.+]], %vector.body ]
+; UF5-NEXT:   [[SUM2:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM2_NEXT:%.+]],  %vector.body ]
+; UF5-NEXT:   [[SUM3:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM3_NEXT:%.+]], %vector.body ]
+; UF5-NEXT:   [[SUM4:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM4_NEXT:%.+]], %vector.body ]
+; UF5-NEXT:   [[IV0:%.+]] = add i64 [[IV]], 0
+; UF5-NEXT:   [[IV1:%.+]] = add i64 [[IV]], 4
+; UF5-NEXT:   [[IV2:%.+]] = add i64 [[IV]], 8
+; UF5-NEXT:   [[IV3:%.+]] = add i64 [[IV]], 12
+; UF5-NEXT:   [[IV4:%.+]] = add i64 [[IV]], 16
+; UF5-NEXT:   [[GEP0:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV0]]
+; UF5-NEXT:   [[GEP1:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV1]]
+; UF5-NEXT:   [[GEP2:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV2]]
+; UF5-NEXT:   [[GEP3:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV3]]
+; UF5-NEXT:   [[GEP4:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV4]]
+; UF5-NEXT:   [[L_GEP0:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 0
+; UF5-NEXT:   [[BC0:%.+]] = bitcast i32* [[L_GEP0]] to <4 x i32>*
+; UF5-NEXT:   [[L0:%.+]] = load <4 x i32>, <4 x i32>* [[BC0]], align 4
+; UF5-NEXT:   [[L_GEP1:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 4
+; UF5-NEXT:   [[BC1:%.+]] = bitcast i32* [[L_GEP1]] to <4 x i32>*
+; UF5-NEXT:   [[L1:%.+]] = load <4 x i32>, <4 x i32>* [[BC1]], align 4
+; UF5-NEXT:   [[L_GEP2:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 8
+; UF5-NEXT:   [[BC2:%.+]] = bitcast i32* [[L_GEP2]] to <4 x i32>*
+; UF5-NEXT:   [[L2:%.+]] = load <4 x i32>, <4 x i32>* [[BC2]], align 4
+; UF5-NEXT:   [[L_GEP3:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 12
+; UF5-NEXT:   [[BC3:%.+]] = bitcast i32* [[L_GEP3]] to <4 x i32>*
+; UF5-NEXT:   [[L3:%.+]] = load <4 x i32>, <4 x i32>* [[BC3]], align 4
+; UF5-NEXT:   [[L_GEP4:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 16
+; UF5-NEXT:   [[BC4:%.+]] = bitcast i32* [[L_GEP4]] to <4 x i32>*
+; UF5-NEXT:   [[L4:%.+]] = load <4 x i32>, <4 x i32>* [[BC4]], align 4
+; UF5-NEXT:   [[SUM0_NEXT]] = add <4 x i32> [[SUM0]], [[L0]]
+; UF5-NEXT:   [[SUM1_NEXT]] = add <4 x i32> [[SUM1]], [[L1]]
+; UF5-NEXT:   [[SUM2_NEXT]] = add <4 x i32> [[SUM2]], [[L2]]
+; UF5-NEXT:   [[SUM3_NEXT]] = add <4 x i32> [[SUM3]], [[L3]]
+; UF5-NEXT:   [[SUM4_NEXT]] = add <4 x i32> [[SUM4]], [[L4]]
+; UF5-NEXT:   [[IV_NEXT]] = add nuw i64 [[IV]], 20
+; UF5-NEXT:   [[EC:%.+]] = icmp eq i64 [[IV_NEXT]], %n.vec
+; UF5-NEXT:   br i1 [[EC]], label %middle.block, label %vector.body
+;
+; UF5-LABEL: middle.block:
+; UF5-NEXT:   [[RDX0:%.+]] = add <4 x i32> [[SUM1_NEXT]], [[SUM0_NEXT]]
+; UF5-NEXT:   [[RDX1:%.+]] = add <4 x i32> [[SUM2_NEXT]], [[RDX0]]
+; UF5-NEXT:   [[RDX2:%.+]] = add <4 x i32> [[SUM3_NEXT]], [[RDX1]]
+; UF5-NEXT:   [[RDX3:%.+]] = add <4 x i32> [[SUM4_NEXT]], [[RDX2]]
+; UF5-NEXT:   call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[RDX3]])
+;
 
 entry:
   br label %loop

diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
index 7a0062a07af76..8f09dde0812d9 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -march=r600 -mcpu=cayman -basic-aa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
 
 ; Check vectorization that would ordinarily require a runtime bounds
@@ -28,60 +27,8 @@ target datalayout = "e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-
 ; space, so this should vectorize normally.
 define void @foo(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %n) #0 {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[N]] to i16
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i16 [[TMP0]]
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i16 [[TMP0]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i16 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !0
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i16 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !3, !noalias !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i16 [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 3
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i16 [[TMP11]]
-; CHECK-NEXT:    store i32 [[MUL]], i32 addrspace(1)* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: <4 x i32>
+; CHECK: ret
 
 entry:
   %cmp1 = icmp slt i32 0, %n
@@ -107,27 +54,8 @@ for.end:                                          ; preds = %for.body, %entry
 ; Parameters are unidentified and different address spaces, so cannot vectorize.
 define void @bar0(i32* %a, i32 addrspace(1)* %b, i32 %n) #0 {
 ; CHECK-LABEL: @bar0(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B:%.*]], i16 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP1]], 3
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_02]]
-; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
 
 entry:
   %cmp1 = icmp slt i32 0, %n
@@ -153,27 +81,8 @@ for.end:                                          ; preds = %for.body, %entry
 ; Swapped arguments should be the same
 define void @bar1(i32 addrspace(1)* %a, i32* %b, i32 %n) #0 {
 ; CHECK-LABEL: @bar1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_02]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A:%.*]], i16 [[TMP1]]
-; CHECK-NEXT:    store i32 [[MUL]], i32 addrspace(1)* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
 
 entry:
   %cmp1 = icmp slt i32 0, %n
@@ -200,50 +109,8 @@ for.end:                                          ; preds = %for.body, %entry
 ; address spaces are different.
 define void @bar2(i32* noalias %a, i32 addrspace(1)* noalias %b, i32 %n) #0 {
 ; CHECK-LABEL: @bar2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B:%.*]], i16 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 addrspace(1)* [[TMP1]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i16 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP8]], 3
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_02]]
-; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: <4 x i32>
+; CHECK: ret
 
 entry:
   %cmp1 = icmp slt i32 0, %n
@@ -270,27 +137,8 @@ for.end:                                          ; preds = %for.body, %entry
 ; generally safe and shouldn't be vectorized.
 define void @arst0(i32* %b, i32 %n) #0 {
 ; CHECK-LABEL: @arst0(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_02]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(1)* @g_as1, i16 0, i16 [[TMP1]]
-; CHECK-NEXT:    store i32 [[MUL]], i32 addrspace(1)* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
 
 entry:
   %cmp1 = icmp slt i32 0, %n
@@ -318,27 +166,8 @@ for.end:                                          ; preds = %for.body, %entry
 ; This isn't generally safe and shouldn't be vectorized.
 define void @arst1(i32* %b, i32 %n) #0 {
 ; CHECK-LABEL: @arst1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(1)* @g_as1, i16 0, i16 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP1]], 3
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_02]]
-; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
 
 entry:
   %cmp1 = icmp slt i32 0, %n
@@ -365,50 +194,8 @@ for.end:                                          ; preds = %for.body, %entry
 ; spaces. This should be vectorized.
 define void @aoeu(i32 %n) #0 {
 ; CHECK-LABEL: @aoeu(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(2)* @q_as2, i32 0, i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(2)* [[TMP0]] to <4 x i32> addrspace(2)*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32> addrspace(2)* [[TMP1]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(1)* @g_as1, i16 0, i16 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32> addrspace(1)* [[TMP5]], align 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(2)* @q_as2, i32 0, i32 [[I_02]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(2)* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 3
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(1)* @g_as1, i16 0, i16 [[TMP8]]
-; CHECK-NEXT:    store i32 [[MUL]], i32 addrspace(1)* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: <4 x i32>
+; CHECK: ret
 
 entry:
   %cmp1 = icmp slt i32 0, %n

diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
index 1a1820fb0a739..e81ea80c795fc 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
@@ -12,26 +12,26 @@ define void @test(float* %A, i32 %x) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], [[X]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP1]], [[X]]
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[WIDE_LOAD]], <4 x float>* [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[X]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP2]], [[X]]
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[WIDE_LOAD]], <4 x float>* [[TMP15]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 undef, undef
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]

diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-pointer-element-type.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-pointer-element-type.ll
index ec15220ad5109..e055888a37696 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-pointer-element-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-pointer-element-type.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=2 %s -S -debug 2>&1 | FileCheck %s
 ; RUN: opt -passes='loop-vectorize' -force-vector-width=2 %s -S -debug 2>&1 | FileCheck %s
 
@@ -11,130 +10,14 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 ; addition of the size of the element type (a pointer) for the end bound.
 
 define void @test(i64 %arg, i32 %arg1, i8** %base) {
-; CHECK-LABEL: @test(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[ARG:%.*]], 1
-; CHECK-NEXT:    [[SMIN19:%.*]] = call i64 @llvm.smin.i64(i64 [[ARG]], i64 1)
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN19]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; CHECK:       vector.scevcheck:
-; CHECK-NEXT:    [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[ARG]], i64 1)
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[ARG]], [[SMIN]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i32 [[ARG1:%.*]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP4]])
-; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 [[TMP3]], [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], [[TMP3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[MUL_OVERFLOW]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[TMP2]], 4294967295
-; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8*, i8** [[BASE:%.*]], i64 [[TMP10]]
-; CHECK-NEXT:    [[MUL1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP2]])
-; CHECK-NEXT:    [[MUL_RESULT2:%.*]] = extractvalue { i64, i1 } [[MUL1]], 0
-; CHECK-NEXT:    [[MUL_OVERFLOW3:%.*]] = extractvalue { i64, i1 } [[MUL1]], 1
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = bitcast i8** [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 0, [[MUL_RESULT2]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[SCEVGEP4]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i8* [[TMP12]], [[SCEVGEP4]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP13]], [[MUL_OVERFLOW3]]
-; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i8*, i8** [[BASE]], i64 [[ARG]]
-; CHECK-NEXT:    [[MUL6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP2]])
-; CHECK-NEXT:    [[MUL_RESULT7:%.*]] = extractvalue { i64, i1 } [[MUL6]], 0
-; CHECK-NEXT:    [[MUL_OVERFLOW8:%.*]] = extractvalue { i64, i1 } [[MUL6]], 1
-; CHECK-NEXT:    [[SCEVGEP59:%.*]] = bitcast i8** [[SCEVGEP5]] to i8*
-; CHECK-NEXT:    [[TMP15:%.*]] = sub i64 0, [[MUL_RESULT7]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, i8* [[SCEVGEP59]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ugt i8* [[TMP16]], [[SCEVGEP59]]
-; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP17]], [[MUL_OVERFLOW8]]
-; CHECK-NEXT:    [[TMP19:%.*]] = or i1 [[TMP9]], [[TMP14]]
-; CHECK-NEXT:    [[TMP20:%.*]] = or i1 [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SMIN10:%.*]] = call i64 @llvm.smin.i64(i64 [[ARG]], i64 1)
-; CHECK-NEXT:    [[TMP21:%.*]] = add nsw i32 [[ARG1]], -1
-; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
-; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[SMIN10]], [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = sub i64 [[TMP23]], [[ARG]]
-; CHECK-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i8*, i8** [[BASE]], i64 [[TMP24]]
-; CHECK-NEXT:    [[SCEVGEP1112:%.*]] = bitcast i8** [[SCEVGEP11]] to i8*
-; CHECK-NEXT:    [[TMP25:%.*]] = add nuw nsw i64 [[TMP22]], 1
-; CHECK-NEXT:    [[SCEVGEP13:%.*]] = getelementptr i8*, i8** [[BASE]], i64 [[TMP25]]
-; CHECK-NEXT:    [[SCEVGEP1314:%.*]] = bitcast i8** [[SCEVGEP13]] to i8*
-; CHECK-NEXT:    [[SCEVGEP15:%.*]] = getelementptr i8*, i8** [[BASE]], i64 [[SMIN10]]
-; CHECK-NEXT:    [[SCEVGEP1516:%.*]] = bitcast i8** [[SCEVGEP15]] to i8*
-; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[ARG]], 1
-; CHECK-NEXT:    [[SCEVGEP17:%.*]] = getelementptr i8*, i8** [[BASE]], i64 [[TMP26]]
-; CHECK-NEXT:    [[SCEVGEP1718:%.*]] = bitcast i8** [[SCEVGEP17]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1112]], [[SCEVGEP1718]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP1516]], [[SCEVGEP1314]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[ARG]], [[N_VEC]]
-; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END21:%.*]] = sub i32 [[ARG1]], [[CAST_CRD]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[ARG1]], [[TMP27]]
-; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[OFFSET_IDX22:%.*]] = sub i64 [[ARG]], [[INDEX]]
-; CHECK-NEXT:    [[TMP29:%.*]] = add i64 [[OFFSET_IDX22]], 0
-; CHECK-NEXT:    [[TMP30:%.*]] = add nsw i32 [[TMP28]], -1
-; CHECK-NEXT:    [[TMP31:%.*]] = zext i32 [[TMP30]] to i64
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8*, i8** [[BASE]], i64 [[TMP31]]
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8*, i8** [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8*, i8** [[TMP33]], i32 -1
-; CHECK-NEXT:    [[TMP35:%.*]] = bitcast i8** [[TMP34]] to <2 x i8*>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i8*>, <2 x i8*>* [[TMP35]], align 8, !alias.scope !0, !noalias !3
-; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <2 x i8*> [[WIDE_LOAD]], <2 x i8*> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8*, i8** [[BASE]], i64 [[TMP29]]
-; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i8*, i8** [[TMP36]], i32 0
-; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8*, i8** [[TMP37]], i32 -1
-; CHECK-NEXT:    [[TMP39:%.*]] = bitcast i8** [[TMP38]] to <2 x i8*>*
-; CHECK-NEXT:    [[WIDE_LOAD23:%.*]] = load <2 x i8*>, <2 x i8*>* [[TMP39]], align 8, !alias.scope !3
-; CHECK-NEXT:    [[REVERSE24:%.*]] = shufflevector <2 x i8*> [[WIDE_LOAD23]], <2 x i8*> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[REVERSE25:%.*]] = shufflevector <2 x i8*> [[REVERSE24]], <2 x i8*> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP40:%.*]] = bitcast i8** [[TMP34]] to <2 x i8*>*
-; CHECK-NEXT:    store <2 x i8*> [[REVERSE25]], <2 x i8*>* [[TMP40]], align 8, !alias.scope !0, !noalias !3
-; CHECK-NEXT:    [[REVERSE26:%.*]] = shufflevector <2 x i8*> [[REVERSE]], <2 x i8*> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP41:%.*]] = bitcast i8** [[TMP38]] to <2 x i8*>*
-; CHECK-NEXT:    store <2 x i8*> [[REVERSE26]], <2 x i8*>* [[TMP41]], align 8, !alias.scope !3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[ARG]], [[ENTRY:%.*]] ], [ [[ARG]], [[VECTOR_SCEVCHECK]] ], [ [[ARG]], [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL20:%.*]] = phi i32 [ [[IND_END21]], [[MIDDLE_BLOCK]] ], [ [[ARG1]], [[ENTRY]] ], [ [[ARG1]], [[VECTOR_SCEVCHECK]] ], [ [[ARG1]], [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL20]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV_2_NEXT]] = add nsw i32 [[IV_2]], -1
-; CHECK-NEXT:    [[IV_2_EXT:%.*]] = zext i32 [[IV_2_NEXT]] to i64
-; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i8*, i8** [[BASE]], i64 [[IV_2_EXT]]
-; CHECK-NEXT:    [[V_1:%.*]] = load i8*, i8** [[IDX_1]], align 8
-; CHECK-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i8*, i8** [[BASE]], i64 [[IV_1]]
-; CHECK-NEXT:    [[V_2:%.*]] = load i8*, i8** [[IDX_2]], align 8
-; CHECK-NEXT:    store i8* [[V_2]], i8** [[IDX_1]], align 8
-; CHECK-NEXT:    store i8* [[V_1]], i8** [[IDX_2]], align 8
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt i64 [[IV_1]], 1
-; CHECK-NEXT:    [[IV_1_NEXT]] = add nsw i64 [[IV_1]], -1
-; CHECK-NEXT:    br i1 [[TMP11]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
-
+; CHECK:      LAA: Adding RT check for range:
+; CHECK-NEXT:  Start: ((8 * (zext i32 (-1 + %arg1)<nsw> to i64))<nuw><nsw> + (8 * (1 smin %arg)) + (-8 * %arg) + %base)
+; CHECK-SAME:  End: (8 + (8 * (zext i32 (-1 + %arg1)<nsw> to i64))<nuw><nsw> + %base)
+; CHECK-NEXT: LAA: Adding RT check for range:
+; CHECK-NEXT:  Start: ((8 * (1 smin %arg)) + %base)
+; CHECK-SAME:  End: (8 + (8 * %arg) + %base)
+
+; CHECK: vector.body
 
 entry:
   br label %loop

diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
index 11843347719e0..c02a4915729e5 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -march=r600 -mcpu=cayman -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
 
 ; Artificial datalayout
@@ -7,63 +6,8 @@ target datalayout = "e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-
 
 define void @add_ints_1_1_1(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #0 {
 ; CHECK-LABEL: @add_ints_1_1_1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i16 200
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i16 200
-; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[C:%.*]], i16 200
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    [[BOUND09:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP7]], [[A]]
-; CHECK-NEXT:    [[BOUND110:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[C]]
-; CHECK-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
-; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i16 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 addrspace(1)* [[TMP1]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], align 4, !alias.scope !0
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[C]], i16 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !3
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD12]]
-; CHECK-NEXT:    [[TMP7:%.*]] = trunc i64 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i16 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !5, !noalias !7
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_01:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i16 [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[C]], i16 [[TMP13]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP14]]
-; CHECK-NEXT:    [[TMP15:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i16 [[TMP15]]
-; CHECK-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add i64 [[I_01]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INC]], 200
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK: <4 x i32>
+; CHECK: ret
 
 entry:
   br label %for.body
@@ -87,26 +31,8 @@ for.end:                                          ; preds = %for.body
 
 define void @add_ints_as_1_0_0(i32 addrspace(1)* %a, i32* %b, i32* %c) #0 {
 ; CHECK-LABEL: @add_ints_as_1_0_0(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_01:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A:%.*]], i16 [[TMP4]]
-; CHECK-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add i64 [[I_01]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INC]], 200
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
 
 entry:
   br label %for.body
@@ -130,26 +56,8 @@ for.end:                                          ; preds = %for.body
 
 define void @add_ints_as_0_1_0(i32* %a, i32 addrspace(1)* %b, i32* %c) #0 {
 ; CHECK-LABEL: @add_ints_as_0_1_0(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_01:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B:%.*]], i16 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP4]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add i64 [[I_01]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INC]], 200
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
 
 entry:
   br label %for.body
@@ -173,26 +81,8 @@ for.end:                                          ; preds = %for.body
 
 define void @add_ints_as_0_1_1(i32* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #0 {
 ; CHECK-LABEL: @add_ints_as_0_1_1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_01:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B:%.*]], i16 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[C:%.*]], i16 [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP4]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add i64 [[I_01]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INC]], 200
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
 
 entry:
   br label %for.body
@@ -216,26 +106,8 @@ for.end:                                          ; preds = %for.body
 
 define void @add_ints_as_0_1_2(i32* %a, i32 addrspace(1)* %b, i32 addrspace(2)* %c) #0 {
 ; CHECK-LABEL: @add_ints_as_0_1_2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_01:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B:%.*]], i16 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32 addrspace(2)* [[C:%.*]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(2)* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP4]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[INC]] = add i64 [[I_01]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INC]], 200
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
 
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll
index 79ac5a7eefd62..aff3b5cf75e42 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll
@@ -1,13 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 define void @add_ints(i32* nocapture %A, i32* nocapture %B, i32* nocapture %C) {
 ; CHECK-LABEL: @add_ints(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
+; CHECK-LABEL: vector.memcheck:
 ; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 200
 ; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 200
 ; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 200
@@ -18,44 +15,10 @@ define void @add_ints(i32* nocapture %A, i32* nocapture %B, i32* nocapture %C) {
 ; CHECK-NEXT:    [[BOUND110:%.*]] = icmp ugt i32* [[SCEVGEP]], [[C]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
 ; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
-; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label %scalar.ph, label %vector.ph
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    br label %vector.body
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, !alias.scope !0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !3
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD12]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP6]], align 4, !alias.scope !5, !noalias !7
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 200
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
index 617f5b4da6aae..53b2d9e103eab 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
@@ -19,15 +19,15 @@ define void @load_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
-; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = add nuw nsw i64 [[TMP11]], 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
 ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
@@ -39,20 +39,20 @@ define void @load_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = urem i32 [[TMP5]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !alias.scope !0
-; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP10]], <2 x i32>* [[TMP13]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = urem i32 [[TMP13]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[TMP18:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4, !alias.scope !3, !noalias !0
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -102,15 +102,15 @@ define void @store_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
-; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = add nuw nsw i64 [[TMP11]], 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
 ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
@@ -122,20 +122,20 @@ define void @store_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = urem i32 [[TMP5]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !alias.scope !8, !noalias !11
-; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP10]], <2 x i32>* [[TMP13]], align 4, !alias.scope !11
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = urem i32 [[TMP13]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT:    [[TMP18:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP14]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4, !alias.scope !11
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -264,26 +264,26 @@ define void @clamped_index_equal_dependence(i32* %A, i32* %B, i32 %N) {
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
-; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = urem i32 [[TMP2]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP7]], <2 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = urem i32 [[TMP10]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP15]], <2 x i32>* [[TMP16]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]

diff  --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
index eb0f0cfe49f53..808cb70b59999 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
@@ -1,81 +1,14 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=2 -force-vector-interleave=2  -S | FileCheck %s
 
 define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1600, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1600, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1600, [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x float>, <vscale x 2 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 2
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x float>, <vscale x 2 x float>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = fcmp ogt <vscale x 2 x float> [[WIDE_LOAD]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.000000e+02, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP19:%.*]] = fcmp ogt <vscale x 2 x float> [[WIDE_LOAD1]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.000000e+02, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 2 x i1> [[TMP18]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP20]])
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[TMP19]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP21]])
-; CHECK-NEXT:    [[TMP22:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.000000e+00, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP23:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD1]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.000000e+00, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP26]] to <vscale x 2 x float>*
-; CHECK-NEXT:    store <vscale x 2 x float> [[TMP22]], <vscale x 2 x float>* [[TMP27]], align 4
-; CHECK-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], 2
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 [[TMP29]]
-; CHECK-NEXT:    [[TMP31:%.*]] = bitcast float* [[TMP30]] to <vscale x 2 x float>*
-; CHECK-NEXT:    store <vscale x 2 x float> [[TMP23]], <vscale x 2 x float>* [[TMP31]], align 4
-; CHECK-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP33:%.*]] = mul i64 [[TMP32]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP33]]
-; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1600, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP35]], 1.000000e+02
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP1]])
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP35]], 1.000000e+00
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK:         [[FCMP1:%.*]] = fcmp ogt <vscale x 2 x float>
+; CHECK-NEXT:    [[FCMP2:%.*]] = fcmp ogt <vscale x 2 x float>
+; CHECK-NEXT:    [[FCMP1L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP1]], i32 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[FCMP1L0]])
+; CHECK-NEXT:    [[FCMP2L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP2]], i32 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[FCMP2L0]])
 entry:
   br label %for.body
 
@@ -104,92 +37,14 @@ attributes #0 = { nounwind willreturn }
 
 define void @test2(float *%a, float *%b) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B1:%.*]] = bitcast float* [[B:%.*]] to i8*
-; CHECK-NEXT:    [[A3:%.*]] = bitcast float* [[A:%.*]] to i8*
-; CHECK-NEXT:    [[PTRINT1:%.*]] = ptrtoint float* [[A]] to i64
-; CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[PTRINT1]], 0
-; CHECK-NEXT:    [[PTRINT2:%.*]] = ptrtoint float* [[B]] to i64
-; CHECK-NEXT:    [[MASKCOND4:%.*]] = icmp eq i64 [[PTRINT2]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1600, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[B]], i64 1600
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[A]], i64 1600
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1600, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1600, [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       entry:
+; CHECK:         [[MASKCOND:%.*]] = icmp eq i64 %ptrint1, 0
+; CHECK:         [[MASKCOND4:%.*]] = icmp eq i64 %ptrint2, 0
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x float>, <vscale x 2 x float>* [[TMP13]], align 4, !alias.scope !4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 2
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 2 x float>, <vscale x 2 x float>* [[TMP17]], align 4, !alias.scope !4
-; CHECK-NEXT:    [[TMP18:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.000000e+00, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP19:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD6]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.000000e+00, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP22]] to <vscale x 2 x float>*
-; CHECK-NEXT:    store <vscale x 2 x float> [[TMP18]], <vscale x 2 x float>* [[TMP23]], align 4, !alias.scope !7, !noalias !4
-; CHECK-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP25:%.*]] = mul i32 [[TMP24]], 2
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP26]] to <vscale x 2 x float>*
-; CHECK-NEXT:    store <vscale x 2 x float> [[TMP19]], <vscale x 2 x float>* [[TMP27]], align 4, !alias.scope !7, !noalias !4
-; CHECK-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP29]]
-; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1600, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK:         tail call void @llvm.assume(i1 [[MASKCOND]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP31:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP31]], 1.000000e+00
+; CHECK:         tail call void @llvm.assume(i1 [[MASKCOND4]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   %ptrint1 = ptrtoint float* %a to i64
   %maskcond = icmp eq i64 %ptrint1, 0
@@ -221,99 +76,9 @@ for.end:                                          ; preds = %for.body
 define void @predicated_assume(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %n) {
 ; Check that the vector.body does not contain any assumes.
 ; CHECK-LABEL: @predicated_assume(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = add <vscale x 2 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP8]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 2
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP12]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 1
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp ult <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 495616, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ult <vscale x 2 x i64> [[STEP_ADD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 495616, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ult <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 991232, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp ult <vscale x 2 x i64> [[STEP_ADD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 991232, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP20:%.*]] = xor <vscale x 2 x i1> [[TMP16]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP21:%.*]] = xor <vscale x 2 x i1> [[TMP17]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP16]], <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 2.300000e+01, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 4.200000e+01, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <vscale x 2 x i1> [[TMP17]], <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 2.300000e+01, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 4.200000e+01, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x float>, <vscale x 2 x float>* [[TMP25]], align 4
-; CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 2
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x float>, <vscale x 2 x float>* [[TMP29]], align 4
-; CHECK-NEXT:    [[TMP30:%.*]] = fmul <vscale x 2 x float> [[PREDPHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP31:%.*]] = fmul <vscale x 2 x float> [[PREDPHI2]], [[WIDE_LOAD3]]
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP35:%.*]] = bitcast float* [[TMP34]] to <vscale x 2 x float>*
-; CHECK-NEXT:    store <vscale x 2 x float> [[TMP30]], <vscale x 2 x float>* [[TMP35]], align 4
-; CHECK-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP37:%.*]] = mul i32 [[TMP36]], 2
-; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 [[TMP37]]
-; CHECK-NEXT:    [[TMP39:%.*]] = bitcast float* [[TMP38]] to <vscale x 2 x float>*
-; CHECK-NEXT:    store <vscale x 2 x float> [[TMP31]], <vscale x 2 x float>* [[TMP39]], align 4
-; CHECK-NEXT:    [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP40]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP41]]
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[STEP_ADD]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-NOT:     llvm.assume
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END5:%.*]] ]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i64 [[INDVARS_IV]], 495616
-; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_END5]], label [[IF_ELSE:%.*]]
-; CHECK:       if.else:
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 991232
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP2]])
-; CHECK-NEXT:    br label [[IF_END5]]
-; CHECK:       if.end5:
-; CHECK-NEXT:    [[X_0:%.*]] = phi float [ 4.200000e+01, [[IF_ELSE]] ], [ 2.300000e+01, [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP43:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[X_0]], [[TMP43]]
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[MUL]], float* [[ARRAYIDX7]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll b/llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll
index f10641a0d1c50..813dfbaa40b51 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll
@@ -1,79 +1,17 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=4 -force-vector-interleave=2  -S | FileCheck %s
 
 define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1600, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1600, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1600, [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP17]], align 4
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
-; CHECK-NEXT:    [[TMP18:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP19:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD1]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP22]] to <vscale x 4 x float>*
-; CHECK-NEXT:    store <vscale x 4 x float> [[TMP18]], <vscale x 4 x float>* [[TMP23]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP25:%.*]] = mul i32 [[TMP24]], 4
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP26]] to <vscale x 4 x float>*
-; CHECK-NEXT:    store <vscale x 4 x float> [[TMP19]], <vscale x 4 x float>* [[TMP27]], align 4
-; CHECK-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP29]]
-; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1600, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP31:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP31]], 1.000000e+02
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP31]], 1.000000e+00
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body
 
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
@@ -97,91 +35,16 @@ declare void @llvm.experimental.noalias.scope.decl(metadata)
 %struct.data = type { float*, float* }
 
 define void @test2(float* %a, float* %b) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A1:%.*]] = bitcast float* [[A:%.*]] to i8*
-; CHECK-NEXT:    [[B3:%.*]] = bitcast float* [[B:%.*]] to i8*
-; CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint float* [[B]] to i64
-; CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[PTRINT]], 0
-; CHECK-NEXT:    [[PTRINT2:%.*]] = ptrtoint float* [[A]] to i64
-; CHECK-NEXT:    [[MASKCOND4:%.*]] = icmp eq i64 [[PTRINT2]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1600, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 1600
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[B]], i64 1600
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1600, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1600, [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP13]], align 4, !alias.scope !7
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP17]], align 4, !alias.scope !7
-; CHECK-NEXT:    [[TMP18:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP19:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD6]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]])
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP22]] to <vscale x 4 x float>*
-; CHECK-NEXT:    store <vscale x 4 x float> [[TMP18]], <vscale x 4 x float>* [[TMP23]], align 4, !alias.scope !12, !noalias !7
-; CHECK-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP25:%.*]] = mul i32 [[TMP24]], 4
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP26]] to <vscale x 4 x float>*
-; CHECK-NEXT:    store <vscale x 4 x float> [[TMP19]], <vscale x 4 x float>* [[TMP27]], align 4, !alias.scope !12, !noalias !7
-; CHECK-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP29]]
-; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1600, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP31:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP31]], 1.000000e+00
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META10]])
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @test2
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST:!.*]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST:!.*]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
 entry:
   %ptrint = ptrtoint float* %b to i64
   %maskcond = icmp eq i64 %ptrint, 0
@@ -207,102 +70,19 @@ for.end:                                          ; preds = %for.body
 }
 
 define void @predicated_noalias_scope_decl(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %n) {
+
 ; Check that the vector.body still contains a llvm.experimental.noalias.scope.decl
+
 ; CHECK-LABEL: @predicated_noalias_scope_decl(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = add <vscale x 4 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP8]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP12]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 1
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp ult <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 495616, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ult <vscale x 4 x i64> [[STEP_ADD]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 495616, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ult <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 991232, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp ult <vscale x 4 x i64> [[STEP_ADD]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 991232, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT:    [[TMP20:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP21:%.*]] = xor <vscale x 4 x i1> [[TMP17]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.300000e+01, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 4.200000e+01, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <vscale x 4 x i1> [[TMP17]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.300000e+01, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 4.200000e+01, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP25]], align 4
-; CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP29]], align 4
-; CHECK-NEXT:    [[TMP30:%.*]] = fmul <vscale x 4 x float> [[PREDPHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP31:%.*]] = fmul <vscale x 4 x float> [[PREDPHI2]], [[WIDE_LOAD3]]
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP35:%.*]] = bitcast float* [[TMP34]] to <vscale x 4 x float>*
-; CHECK-NEXT:    store <vscale x 4 x float> [[TMP30]], <vscale x 4 x float>* [[TMP35]], align 4
-; CHECK-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP37:%.*]] = mul i32 [[TMP36]], 4
-; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 [[TMP37]]
-; CHECK-NEXT:    [[TMP39:%.*]] = bitcast float* [[TMP38]] to <vscale x 4 x float>*
-; CHECK-NEXT:    store <vscale x 4 x float> [[TMP31]], <vscale x 4 x float>* [[TMP39]], align 4
-; CHECK-NEXT:    [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP40]], 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP41]]
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END5:%.*]] ]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i64 [[INDVARS_IV]], 495616
-; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_END5]], label [[IF_ELSE:%.*]]
-; CHECK:       if.else:
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 991232
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT:    br label [[IF_END5]]
-; CHECK:       if.end5:
-; CHECK-NEXT:    [[X_0:%.*]] = phi float [ 4.200000e+01, [[IF_ELSE]] ], [ 2.300000e+01, [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP43:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[X_0]], [[TMP43]]
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[MUL]], float* [[ARRAYIDX7]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-;
+; CHECK:   vector.body:
+; CHECK:   call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK:   scalar.ph:
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK:   if.else:
+; CHECK:   call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: }
 
 entry:
   br label %for.body
@@ -340,3 +120,8 @@ for.cond.cleanup:                                 ; preds = %if.end5
 !5 = distinct !{!5, !6}
 !6 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
 
+; CHECK: [[SCOPE0_LIST]] = !{[[SCOPE0:!.*]]}
+; CHECK: [[SCOPE0]] = distinct !{[[SCOPE0]], [[SCOPE0_DOM:!.*]]}
+; CHECK: [[SCOPE0_DOM]] = distinct !{[[SCOPE0_DOM]]}
+; CHECK: [[SCOPE4_LIST]] = !{[[SCOPE4:!.*]]}
+; CHECK: [[SCOPE4]] = distinct !{[[SCOPE4]], [[SCOPE0_DOM]]}

diff  --git a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
index adc93ceb93743..9d0aa5ef082bc 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
@@ -5,79 +5,32 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 define i8 @reduction_add_trunc(i8* noalias nocapture %A) {
 ; CHECK-LABEL: @reduction_add_trunc(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 16
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 256, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 256, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 256, [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 255, i32 0), [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = and <vscale x 8 x i32> [[VEC_PHI]], shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 255, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP11:%.*]] = and <vscale x 8 x i32> [[VEC_PHI1]], shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 255, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[TMP9]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <vscale x 8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>* [[TMP15]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP16]], 8
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <vscale x 8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>* [[TMP19]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
-; CHECK-NEXT:    [[TMP22:%.*]] = add <vscale x 8 x i32> [[TMP10]], [[TMP20]]
-; CHECK-NEXT:    [[TMP23:%.*]] = add <vscale x 8 x i32> [[TMP11]], [[TMP21]]
-; CHECK-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP25:%.*]] = mul i32 [[TMP24]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP25]]
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    [[TMP27:%.*]] = trunc <vscale x 8 x i32> [[TMP22]] to <vscale x 8 x i8>
-; CHECK-NEXT:    [[TMP28]] = zext <vscale x 8 x i8> [[TMP27]] to <vscale x 8 x i32>
-; CHECK-NEXT:    [[TMP29:%.*]] = trunc <vscale x 8 x i32> [[TMP23]] to <vscale x 8 x i8>
-; CHECK-NEXT:    [[TMP30]] = zext <vscale x 8 x i8> [[TMP29]] to <vscale x 8 x i32>
-; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 255, i32 0), %vector.ph ], [ [[TMP34:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP36:%.*]], %vector.body ]
+; CHECK:         [[TMP14:%.*]] = and <vscale x 8 x i32> [[VEC_PHI]], shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 255, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = and <vscale x 8 x i32> [[VEC_PHI1]], shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 255, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK:         [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>*
+; CHECK:         [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>*
+; CHECK-NEXT:    [[TMP26:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-NEXT:    [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NEXT:    [[TMP28:%.*]] = add <vscale x 8 x i32> [[TMP14]], [[TMP26]]
+; CHECK-NEXT:    [[TMP29:%.*]] = add <vscale x 8 x i32> [[TMP15]], [[TMP27]]
+; CHECK-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], 16
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP31]]
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], {{%.*}}
+; CHECK-NEXT:    [[TMP33:%.*]] = trunc <vscale x 8 x i32> [[TMP28]] to <vscale x 8 x i8>
+; CHECK-NEXT:    [[TMP34]] = zext <vscale x 8 x i8> [[TMP33]] to <vscale x 8 x i32>
+; CHECK-NEXT:    [[TMP35:%.*]] = trunc <vscale x 8 x i32> [[TMP29]] to <vscale x 8 x i8>
+; CHECK-NEXT:    [[TMP36]] = zext <vscale x 8 x i8> [[TMP35]] to <vscale x 8 x i32>
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc <vscale x 8 x i32> [[TMP28]] to <vscale x 8 x i8>
-; CHECK-NEXT:    [[TMP32:%.*]] = trunc <vscale x 8 x i32> [[TMP30]] to <vscale x 8 x i8>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 8 x i8> [[TMP32]], [[TMP31]]
-; CHECK-NEXT:    [[TMP33:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[BIN_RDX]])
-; CHECK-NEXT:    [[TMP34:%.*]] = zext i8 [[TMP33]] to i32
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 256, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 255, [[ENTRY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SUM_02P:%.*]] = phi i32 [ [[L9:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SUM_02:%.*]] = and i32 [[SUM_02P]], 255
-; CHECK-NEXT:    [[L2:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[INDVARS_IV]]
-; CHECK-NEXT:    [[L3:%.*]] = load i8, i8* [[L2]], align 4
-; CHECK-NEXT:    [[L3E:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[L9]] = add i32 [[SUM_02]], [[L3E]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 256
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L9]], [[LOOP]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[SUM_0_LCSSA]] to i8
-; CHECK-NEXT:    ret i8 [[RET]]
+; CHECK-NEXT:    [[TMP37:%.*]] = trunc <vscale x 8 x i32> [[TMP34]] to <vscale x 8 x i8>
+; CHECK-NEXT:    [[TMP38:%.*]] = trunc <vscale x 8 x i32> [[TMP36]] to <vscale x 8 x i8>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 8 x i8> [[TMP38]], [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[BIN_RDX]])
+; CHECK-NEXT:    [[TMP40:%.*]] = zext i8 [[TMP39]] to i32
 ;
 entry:
   br label %loop

diff  --git a/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll b/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
index 6d044b8041980..7fda52941072f 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
@@ -13,12 +13,12 @@ define void @trunc_minimal_bitwidth(i8* %bptr, i16* noalias %hptr, i32 %val, i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT]] to <vscale x 4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT2]] to <vscale x 4 x i16>
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[HPTR:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[TMP5]] to <vscale x 4 x i16>*
 ; CHECK-NEXT:    store <vscale x 4 x i16> [[TMP4]], <vscale x 4 x i16>* [[TMP6]], align 2
@@ -91,28 +91,7 @@ define void @trunc_minimal_bitwidths_shufflevector (i8* %p, i32 %arg1, i64 %len)
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[LEN]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP13]] to i32
-; CHECK-NEXT:    [[CONV17:%.*]] = xor i32 [[CONV]], [[ARG1]]
-; CHECK-NEXT:    [[MUL18:%.*]] = mul nuw nsw i32 [[CONV17]], [[CONV]]
-; CHECK-NEXT:    [[CONV19:%.*]] = trunc i32 [[MUL18]] to i8
-; CHECK-NEXT:    store i8 [[CONV19]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
index 13d37205a3cd7..ad2571d7a34a8 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S -scalable-vectorization=on < %s 2>&1 | FileCheck %s
 
@@ -8,25 +7,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is ignored because the target does not support scalable vectors. The compiler will pick a more suitable value.
 ; CHECK: LV: The Widest register safe to use is: 32 bits.
 define void @test1(i32* %a, i32* %b) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[IV]], 4
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %loop
 

diff  --git a/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll b/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll
index cbf1ef9253e75..0c97e6ac475ee 100644
--- a/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -S -o - < %s 2>&1 | FileCheck %s
 
@@ -9,107 +8,6 @@
 ; The bitcast below will be scalarized due to the predication in the loop. Bitcasts
 ; between pointer types should be treated as free, despite the scalarization.
 define void @foo(%struct.foo* noalias nocapture %in, i32* noalias nocapture readnone %out, i64 %n) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; CHECK:       vector.scevcheck:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr [[STRUCT_FOO:%.*]], %struct.foo* [[IN:%.*]], i64 0, i32 1
-; CHECK-NEXT:    [[SCEVGEP1:%.*]] = bitcast i64* [[SCEVGEP]] to %struct.foo*
-; CHECK-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 12, i64 [[TMP0]])
-; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
-; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
-; CHECK-NEXT:    [[SCEVGEP12:%.*]] = bitcast %struct.foo* [[SCEVGEP1]] to i8*
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 0, [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* [[SCEVGEP12]], i64 [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i8* [[TMP2]], [[SCEVGEP12]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[IN]], i64 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[IN]], i64 [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64* [[TMP7]] to i32*
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[TMP8]] to i32*
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[IN]], i64 [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[IN]], i64 [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP12]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq <2 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT:    [[TMP18:%.*]] = xor <2 x i1> [[TMP17]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP18]], i32 0
-; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP9]], align 4
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP21:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP18]], i32 1
-; CHECK-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
-; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP10]], align 4
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
-; CHECK-NEXT:    [[TMP24:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP24]], i32 1
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer
-; CHECK-NEXT:    [[TMP28:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[TMP27]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <2 x i1> [[TMP28]], i32 0
-; CHECK-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP30:%.*]] = add nsw i32 [[TMP21]], -1
-; CHECK-NEXT:    store i32 [[TMP30]], i32* [[TMP9]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i1> [[TMP28]], i32 1
-; CHECK-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
-; CHECK:       pred.store.if5:
-; CHECK-NEXT:    [[TMP32:%.*]] = add nsw i32 [[TMP24]], -1
-; CHECK-NEXT:    store i32 [[TMP32]], i32* [[TMP10]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
-; CHECK:       pred.store.continue6:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_012:%.*]] = phi i64 [ [[INC:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[IN]], i64 [[I_012]], i32 1
-; CHECK-NEXT:    [[TMP34:%.*]] = bitcast i64* [[B]] to i32*
-; CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[IN]], i64 [[I_012]], i32 0
-; CHECK-NEXT:    [[TMP35:%.*]] = load i32, i32* [[A]], align 8
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP35]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[LAND_LHS_TRUE:%.*]]
-; CHECK:       land.lhs.true:
-; CHECK-NEXT:    [[TMP36:%.*]] = load i32, i32* [[TMP34]], align 4
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[TMP36]], 0
-; CHECK-NEXT:    br i1 [[CMP2]], label [[IF_THEN:%.*]], label [[IF_END]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP36]], -1
-; CHECK-NEXT:    store i32 [[SUB]], i32* [[TMP34]], align 4
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_012]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
index 5f0aae8b52c4a..4e64e94459a1d 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
@@ -1,157 +1,116 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s --check-prefix=CHECK-VF2IC1
 ; RUN: opt -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC2
 
 define i32 @pred_select_const_i32_from_icmp(i32* noalias nocapture readonly %src1, i32* noalias nocapture readonly %src2, i64 %n) {
 ; CHECK-VF2IC1-LABEL: @pred_select_const_i32_from_icmp(
-; CHECK-VF2IC1-NEXT:  entry:
-; CHECK-VF2IC1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-VF2IC1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF2IC1:       vector.ph:
-; CHECK-VF2IC1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-VF2IC1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF2IC1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-VF2IC1:       vector.body:
-; CHECK-VF2IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-VF2IC1-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE2]] ]
-; CHECK-VF2IC1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF2IC1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[TMP0]]
-; CHECK-VF2IC1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-VF2IC1-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-VF2IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
+; CHECK-VF2IC1:         [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue2 ]
+; CHECK-VF2IC1:         [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* {{%.*}}, align 4
 ; CHECK-VF2IC1-NEXT:    [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], <i32 35, i32 35>
 ; CHECK-VF2IC1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-VF2IC1-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-VF2IC1-NEXT:    br i1 [[TMP5]], label %pred.load.if, label %pred.load.continue
 ; CHECK-VF2IC1:       pred.load.if:
-; CHECK-VF2IC1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 [[TMP0]]
+; CHECK-VF2IC1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 {{%.*}}
 ; CHECK-VF2IC1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
 ; CHECK-VF2IC1-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; CHECK-VF2IC1-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK-VF2IC1-NEXT:    br label %pred.load.continue
 ; CHECK-VF2IC1:       pred.load.continue:
-; CHECK-VF2IC1-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
+; CHECK-VF2IC1-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ poison, %vector.body ], [ [[TMP8]], %pred.load.if ]
 ; CHECK-VF2IC1-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-VF2IC1-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
+; CHECK-VF2IC1-NEXT:    br i1 [[TMP10]], label %pred.load.if1, label %pred.load.continue2
 ; CHECK-VF2IC1:       pred.load.if1:
-; CHECK-VF2IC1-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 1
-; CHECK-VF2IC1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP11]]
+; CHECK-VF2IC1:         [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 {{%.*}}
 ; CHECK-VF2IC1-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
 ; CHECK-VF2IC1-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP13]], i32 1
-; CHECK-VF2IC1-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK-VF2IC1-NEXT:    br label %pred.load.continue2
 ; CHECK-VF2IC1:       pred.load.continue2:
-; CHECK-VF2IC1-NEXT:    [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
+; CHECK-VF2IC1-NEXT:    [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %pred.load.continue ], [ [[TMP14]], %pred.load.if1 ]
 ; CHECK-VF2IC1-NEXT:    [[TMP16:%.*]] = icmp eq <2 x i32> [[TMP15]], <i32 2, i32 2>
 ; CHECK-VF2IC1-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x i32> <i32 1, i32 1>, <2 x i32> [[VEC_PHI]]
 ; CHECK-VF2IC1-NEXT:    [[TMP18:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
 ; CHECK-VF2IC1-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP17]], <2 x i32> [[VEC_PHI]]
-; CHECK-VF2IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-VF2IC1-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF2IC1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF2IC1:         br i1 {{%.*}}, label %middle.block, label %vector.body
 ; CHECK-VF2IC1:       middle.block:
 ; CHECK-VF2IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i32> [[PREDPHI]], zeroinitializer
 ; CHECK-VF2IC1-NEXT:    [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
 ; CHECK-VF2IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 1, i32 0
-; CHECK-VF2IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF2IC1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-VF2IC1:       scalar.ph:
-; CHECK-VF2IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF2IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF2IC1-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-VF2IC1:         [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ]
+; CHECK-VF2IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ]
+; CHECK-VF2IC1-NEXT:    br label %for.body
 ; CHECK-VF2IC1:       for.body:
-; CHECK-VF2IC1-NEXT:    [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-VF2IC1-NEXT:    [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-VF2IC1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[I_013]]
-; CHECK-VF2IC1-NEXT:    [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-VF2IC1:         [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ]
+; CHECK-VF2IC1:         [[TMP21:%.*]] = load i32, i32* {{%.*}}, align 4
 ; CHECK-VF2IC1-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP21]], 35
-; CHECK-VF2IC1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK-VF2IC1-NEXT:    br i1 [[CMP1]], label %if.then, label %for.inc
 ; CHECK-VF2IC1:       if.then:
-; CHECK-VF2IC1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[I_013]]
-; CHECK-VF2IC1-NEXT:    [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-VF2IC1:         [[TMP22:%.*]] = load i32, i32* {{%.*}}, align 4
 ; CHECK-VF2IC1-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[TMP22]], 2
 ; CHECK-VF2IC1-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; CHECK-VF2IC1-NEXT:    br label [[FOR_INC]]
+; CHECK-VF2IC1-NEXT:    br label %for.inc
 ; CHECK-VF2IC1:       for.inc:
-; CHECK-VF2IC1-NEXT:    [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
-; CHECK-VF2IC1-NEXT:    [[INC]] = add nuw nsw i64 [[I_013]], 1
-; CHECK-VF2IC1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF2IC1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-VF2IC1-NEXT:    [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ]
 ; CHECK-VF2IC1:       for.end.loopexit:
-; CHECK-VF2IC1-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-VF2IC1-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ]
 ; CHECK-VF2IC1-NEXT:    ret i32 [[R_1_LCSSA]]
 ;
 ; CHECK-VF1IC2-LABEL: @pred_select_const_i32_from_icmp(
-; CHECK-VF1IC2-NEXT:  entry:
-; CHECK-VF1IC2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-VF1IC2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF1IC2:       vector.ph:
-; CHECK-VF1IC2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-VF1IC2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF1IC2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-VF1IC2:       vector.body:
-; CHECK-VF1IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ]
-; CHECK-VF1IC2-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE4]] ]
-; CHECK-VF1IC2-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_LOAD_CONTINUE4]] ]
-; CHECK-VF1IC2-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF1IC2-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
-; CHECK-VF1IC2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[INDUCTION]]
-; CHECK-VF1IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[INDUCTION2]]
+; CHECK-VF1IC2:         [[VEC_PHI:%.*]] = phi i32 [ 0, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue4 ]
+; CHECK-VF1IC2-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ 0, %vector.ph ], [ [[PREDPHI5:%.*]], %pred.load.continue4 ]
+; CHECK-VF1IC2:         [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 {{%.*}}
+; CHECK-VF1IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 {{%.*}}
 ; CHECK-VF1IC2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4
 ; CHECK-VF1IC2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
 ; CHECK-VF1IC2-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 35
 ; CHECK-VF1IC2-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 35
-; CHECK-VF1IC2-NEXT:    br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-VF1IC2-NEXT:    br i1 [[TMP4]], label %pred.load.if, label %pred.load.continue
 ; CHECK-VF1IC2:       pred.load.if:
-; CHECK-VF1IC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 [[INDUCTION]]
+; CHECK-VF1IC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 {{%.*}}
 ; CHECK-VF1IC2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
-; CHECK-VF1IC2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK-VF1IC2-NEXT:    br label %pred.load.continue
 ; CHECK-VF1IC2:       pred.load.continue:
-; CHECK-VF1IC2-NEXT:    [[TMP8:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
-; CHECK-VF1IC2-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
+; CHECK-VF1IC2-NEXT:    [[TMP8:%.*]] = phi i32 [ poison, %vector.body ], [ [[TMP7]], %pred.load.if ]
+; CHECK-VF1IC2-NEXT:    br i1 [[TMP5]], label %pred.load.if3, label %pred.load.continue4
 ; CHECK-VF1IC2:       pred.load.if3:
-; CHECK-VF1IC2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[INDUCTION2]]
+; CHECK-VF1IC2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 {{%.*}}
 ; CHECK-VF1IC2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4
-; CHECK-VF1IC2-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
+; CHECK-VF1IC2-NEXT:    br label %pred.load.continue4
 ; CHECK-VF1IC2:       pred.load.continue4:
-; CHECK-VF1IC2-NEXT:    [[TMP11:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP10]], [[PRED_LOAD_IF3]] ]
+; CHECK-VF1IC2-NEXT:    [[TMP11:%.*]] = phi i32 [ poison, %pred.load.continue ], [ [[TMP10]], %pred.load.if3 ]
 ; CHECK-VF1IC2-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 2
 ; CHECK-VF1IC2-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 2
 ; CHECK-VF1IC2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP12]], i32 1, i32 [[VEC_PHI]]
-; CHECK-VF1IC2-NEXT:    [[TMP15:%.*]] = select i1 [[TMP13]], i32 1, i32 [[VEC_PHI1]]
+; CHECK-VF1IC2-NEXT:    [[TMP15:%.*]] = select i1 [[TMP13]], i32 1, i32 [[VEC_PHI2]]
 ; CHECK-VF1IC2-NEXT:    [[TMP16:%.*]] = xor i1 [[TMP4]], true
 ; CHECK-VF1IC2-NEXT:    [[TMP17:%.*]] = xor i1 [[TMP5]], true
 ; CHECK-VF1IC2-NEXT:    [[PREDPHI]] = select i1 [[TMP4]], i32 [[TMP14]], i32 [[VEC_PHI]]
-; CHECK-VF1IC2-NEXT:    [[PREDPHI5]] = select i1 [[TMP5]], i32 [[TMP15]], i32 [[VEC_PHI1]]
-; CHECK-VF1IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-VF1IC2-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF1IC2-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF1IC2-NEXT:    [[PREDPHI5]] = select i1 [[TMP5]], i32 [[TMP15]], i32 [[VEC_PHI2]]
+; CHECK-VF1IC2:         br i1 {{%.*}}, label %middle.block, label %vector.body
 ; CHECK-VF1IC2:       middle.block:
 ; CHECK-VF1IC2-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[PREDPHI]], 0
 ; CHECK-VF1IC2-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[PREDPHI]], i32 [[PREDPHI5]]
-; CHECK-VF1IC2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF1IC2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-VF1IC2:         br i1 {{%.*}}, label %for.end.loopexit, label %scalar.ph
 ; CHECK-VF1IC2:       scalar.ph:
-; CHECK-VF1IC2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF1IC2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF1IC2-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-VF1IC2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ]
+; CHECK-VF1IC2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ]
+; CHECK-VF1IC2-NEXT:    br label %for.body
 ; CHECK-VF1IC2:       for.body:
-; CHECK-VF1IC2-NEXT:    [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-VF1IC2-NEXT:    [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-VF1IC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[I_013]]
-; CHECK-VF1IC2-NEXT:    [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-VF1IC2-NEXT:    [[I_013:%.*]] = phi i64 [ [[INC:%.*]], %for.inc ], [ [[BC_RESUME_VAL]], %scalar.ph ]
+; CHECK-VF1IC2-NEXT:    [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ]
+; CHECK-VF1IC2:         [[TMP19:%.*]] = load i32, i32* {{%.*}}, align 4
 ; CHECK-VF1IC2-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP19]], 35
-; CHECK-VF1IC2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK-VF1IC2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label %for.inc
 ; CHECK-VF1IC2:       if.then:
-; CHECK-VF1IC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[I_013]]
-; CHECK-VF1IC2-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-VF1IC2:         [[TMP20:%.*]] = load i32, i32* {{%.*}}, align 4
 ; CHECK-VF1IC2-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[TMP20]], 2
 ; CHECK-VF1IC2-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; CHECK-VF1IC2-NEXT:    br label [[FOR_INC]]
+; CHECK-VF1IC2-NEXT:    br label %for.inc
 ; CHECK-VF1IC2:       for.inc:
-; CHECK-VF1IC2-NEXT:    [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
-; CHECK-VF1IC2-NEXT:    [[INC]] = add nuw nsw i64 [[I_013]], 1
-; CHECK-VF1IC2-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF1IC2-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-VF1IC2-NEXT:    [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ]
+; CHECK-VF1IC2:         br i1 {{%.*}}, label %for.end.loopexit, label %for.body
 ; CHECK-VF1IC2:       for.end.loopexit:
-; CHECK-VF1IC2-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-VF1IC2-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ]
 ; CHECK-VF1IC2-NEXT:    ret i32 [[R_1_LCSSA]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/LoopVectorize/select-reduction.ll b/llvm/test/Transforms/LoopVectorize/select-reduction.ll
index 8c8638f7b44c8..b35b0a0696fe8 100644
--- a/llvm/test/Transforms/LoopVectorize/select-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-reduction.ll
@@ -24,23 +24,23 @@ define i32 @test(i64 %N, i32 %x) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], <i32 10, i32 10, i32 10, i32 10>
-; CHECK-NEXT:    [[TMP2]] = select <4 x i1> [[TMP1]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 10, i32 10, i32 10, i32 10>
-; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT4]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], <i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[LOOP_PREHEADER]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[LOOP_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[NEXT:%.*]] = phi i32 [ [[SEL:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -49,9 +49,9 @@ define i32 @test(i64 %N, i32 %x) {
 ; CHECK-NEXT:    [[SEL]] = select i1 [[SEL_COND]], i32 [[NEXT]], i32 10
 ; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0
-; CHECK-NEXT:    br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       exit.loopexit:
-; CHECK-NEXT:    [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], [[LOOP]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[RESULT:%.*]] = phi i32 [ 0, [[CHECK]] ], [ [[SEL_LCSSA]], [[EXIT_LOOPEXIT]] ]

diff  --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
index 25e9b90804d50..be799a79520b9 100644
--- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
@@ -17,24 +17,24 @@ define void @single_incoming_phi_no_blend_mask(i64 %a, i64 %b) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <2 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i1> [[TMP6]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> [[PREDPHI]], <2 x i16>* [[TMP10]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDEX]] to i16
+; CHECK-NEXT:    [[TMP3:%.*]] = add i16 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[TMP5]] to <2 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], <i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to <2 x i16>*
+; CHECK-NEXT:    store <2 x i16> [[PREDPHI]], <2 x i16>* [[TMP11]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 32, 32
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -59,7 +59,7 @@ define void @single_incoming_phi_no_blend_mask(i64 %a, i64 %b) {
 ; CHECK-NEXT:    store i16 [[RES]], i16* [[DST_PTR]], align 2
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT:    br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -106,57 +106,32 @@ define void @single_incoming_phi_with_blend_mask(i64 %a, i64 %b) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i16, i16* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[TMP5]] to <2 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i1> [[TMP7]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP9]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP7]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP10]], <2 x i16> [[WIDE_LOAD]], <2 x i16> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <2 x i1> [[TMP11]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[PREDPHI]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[TMP13]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> [[PREDPHI1]], <2 x i16>* [[TMP14]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDEX]] to i16
+; CHECK-NEXT:    [[TMP3:%.*]] = add i16 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i16, i16* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[TMP8]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP8]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x i16> [[WIDE_LOAD]], <2 x i16> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <2 x i1> [[TMP12]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[PREDPHI]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP13]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i16* [[TMP14]] to <2 x i16>*
+; CHECK-NEXT:    store <2 x i16> [[PREDPHI1]], <2 x i16>* [[TMP15]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 32, 32
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16
-; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ugt i64 [[IV]], [[A]]
-; CHECK-NEXT:    br i1 [[CMP_A]], label [[LOOP_COND:%.*]], label [[LOOP_LATCH]]
-; CHECK:       loop.cond:
-; CHECK-NEXT:    [[BLEND:%.*]] = phi i16 [ [[IV_TRUNC]], [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[SRC_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[BLEND]]
-; CHECK-NEXT:    [[LV:%.*]] = load i16, i16* [[SRC_PTR]], align 1
-; CHECK-NEXT:    [[CMP_B:%.*]] = icmp sgt i64 [[IV]], [[A]]
-; CHECK-NEXT:    br i1 [[CMP_B]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]]
-; CHECK:       loop.next:
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[RES:%.*]] = phi i16 [ 0, [[LOOP_HEADER]] ], [ [[LV]], [[LOOP_COND]] ], [ 1, [[LOOP_NEXT]] ]
-; CHECK-NEXT:    [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[IV]]
-; CHECK-NEXT:    store i16 [[RES]], i16* [[DST_PTR]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT:    br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop.header
@@ -203,52 +178,30 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, i16* noalias %dst) {
 ; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND3:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[VEC_IND3]], <2 x i16> [[VEC_IND1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i16, i16* [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = load i16, i16* [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i16> poison, i16 [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i16> [[TMP9]], i16 [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16* [[TMP12]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> [[TMP10]], <2 x i16>* [[TMP13]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i1> [[TMP2]], <i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> [[VEC_IND3]], <2 x i16> [[VEC_IND1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i16, i16* [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, i16* [[TMP7]], align 1
+; CHECK-NEXT:    [[INS1:%.+]] = insertelement <2 x i16> poison, i16 [[TMP8]], i32 0
+; CHECK-NEXT:    [[INS2:%.+]] = insertelement <2 x i16> [[INS1]], i16 [[TMP9]], i32 1
+; CHECK-NEXT:    [[DST0:%.+]] = getelementptr inbounds i16, i16* %dst, i64 [[TMP0]]
+; CHECK-NEXT:    [[DST1:%.+]] = getelementptr inbounds i16, i16* [[DST0]], i32 0
+; CHECK-NEXT:    [[DST1_BC:%.+]] = bitcast i16* [[DST1]] to <2 x i16>*
+; CHECK-NEXT:    store <2 x i16> [[INS2]], <2 x i16>* [[DST1_BC]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
 ; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], <i16 2, i16 2>
 ; CHECK-NEXT:    [[VEC_IND_NEXT4]] = add <2 x i16> [[VEC_IND3]], <i16 2, i16 2>
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 32, 32
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16
-; CHECK-NEXT:    [[IV_TRUNC_2:%.*]] = trunc i64 [[IV]] to i16
-; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ugt i64 [[IV]], [[A]]
-; CHECK-NEXT:    br i1 [[CMP_A]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]]
-; CHECK:       loop.next:
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[BLEND:%.*]] = phi i16 [ [[IV_TRUNC]], [[LOOP_HEADER]] ], [ [[IV_TRUNC_2]], [[LOOP_NEXT]] ]
-; CHECK-NEXT:    [[SRC_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[BLEND]]
-; CHECK-NEXT:    [[LV:%.*]] = load i16, i16* [[SRC_PTR]], align 1
-; CHECK-NEXT:    [[DST_PTR:%.*]] = getelementptr inbounds i16, i16* [[DST]], i64 [[IV]]
-; CHECK-NEXT:    store i16 [[LV]], i16* [[DST_PTR]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT:    br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop.header
@@ -292,71 +245,46 @@ define void @single_incoming_needs_predication(i64 %a, i64 %b) {
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ]
 ; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[PRED_LOAD_CONTINUE4]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[VEC_IND1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load i16, i16* [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i16> poison, i16 [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[VEC_IND1]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, i16* [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
 ; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP7:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
 ; CHECK:       pred.load.if3:
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[VEC_IND1]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i16, i16* [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i16> [[TMP7]], i16 [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[VEC_IND1]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i16, i16* [[TMP11]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP12]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
 ; CHECK:       pred.load.continue4:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x i16> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP15:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP16:%.*]] = xor <2 x i1> [[TMP14]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[TMP1]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[TMP1]], <2 x i1> [[TMP14]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP17]], <2 x i16> [[TMP13]], <2 x i16> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI5:%.*]] = select <2 x i1> [[TMP18]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[PREDPHI]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, i16* [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i16* [[TMP20]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> [[PREDPHI5]], <2 x i16>* [[TMP21]], align 2
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <2 x i1> [[TMP2]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP17:%.*]] = xor <2 x i1> [[TMP15]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP17]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP18]], <2 x i16> [[TMP14]], <2 x i16> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI5:%.*]] = select <2 x i1> [[TMP19]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[PREDPHI]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[TMP20]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[TMP21]] to <2 x i16>*
+; CHECK-NEXT:    store <2 x i16> [[PREDPHI5]], <2 x i16>* [[TMP22]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
 ; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], <i16 2, i16 2>
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, 64
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16
-; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ugt i64 [[IV]], [[A]]
-; CHECK-NEXT:    br i1 [[CMP_A]], label [[LOOP_COND:%.*]], label [[LOOP_LATCH]]
-; CHECK:       loop.cond:
-; CHECK-NEXT:    [[BLEND:%.*]] = phi i16 [ [[IV_TRUNC]], [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[SRC_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[BLEND]]
-; CHECK-NEXT:    [[LV:%.*]] = load i16, i16* [[SRC_PTR]], align 1
-; CHECK-NEXT:    [[CMP_B:%.*]] = icmp sgt i64 [[IV]], [[A]]
-; CHECK-NEXT:    br i1 [[CMP_B]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]]
-; CHECK:       loop.next:
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[RES:%.*]] = phi i16 [ 0, [[LOOP_HEADER]] ], [ [[LV]], [[LOOP_COND]] ], [ 1, [[LOOP_NEXT]] ]
-; CHECK-NEXT:    [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[IV]]
-; CHECK-NEXT:    store i16 [[RES]], i16* [[DST_PTR]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[CMP439:%.*]] = icmp ult i64 [[IV]], 63
-; CHECK-NEXT:    br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop.header
@@ -399,34 +327,18 @@ define void @duplicated_incoming_blocks_blend(i32 %x, i32* %ptr) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1000, 1000
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_I:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[C_0:%.*]] = icmp ugt i32 [[IV]], [[X:%.*]]
-; CHECK-NEXT:    br i1 [[C_0]], label [[LOOP_LATCH]], label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[P:%.*]] = phi i32 [ [[IV]], [[LOOP_HEADER]] ], [ [[IV]], [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr i32, i32* [[PTR]], i32 [[P]]
-; CHECK-NEXT:    store i32 [[P]], i32* [[GEP_PTR]], align 4
-; CHECK-NEXT:    [[ADD_I]] = add nsw i32 [[P]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[ADD_I]], 1000
-; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop.header

diff  --git a/llvm/test/Transforms/LoopVectorize/store-shuffle-bug.ll b/llvm/test/Transforms/LoopVectorize/store-shuffle-bug.ll
index 1ce8a7dfb0204..414a026fcf9b3 100644
--- a/llvm/test/Transforms/LoopVectorize/store-shuffle-bug.ll
+++ b/llvm/test/Transforms/LoopVectorize/store-shuffle-bug.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -basic-aa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -14,63 +13,18 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; a shuffle too many.
 
 define void @t()  {
-; CHECK-LABEL: @t(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 94, [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @uf, i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @xi, i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -3
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 -3
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @q, i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i64 -3
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP7]], [[WIDE_LOAD4]]
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 88
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 5, [[MIDDLE_BLOCK]] ], [ 93, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @uf, i64 0, i64 [[TMP15]]
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @xi, i64 0, i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP17]], [[TMP16]]
-; CHECK-NEXT:    store i32 [[ADD4]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @q, i64 0, i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4
-; CHECK-NEXT:    [[ADD8:%.*]] = add nsw i32 [[ADD4]], [[TMP18]]
-; CHECK-NEXT:    store i32 [[ADD8]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], -1
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[TMP19]], 2
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body
 
+; CHECK-LABEL: @t(
+; CHECK: vector.body:
+; CHECK: [[VAR1:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; CHECK: [[VAR2:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; CHECK: [[VAR3:%[a-zA-Z0-9]+]] = add nsw <4 x i32> [[VAR2]], [[VAR1]]
+; CHECK: store <4 x i32> [[VAR3]]
+; CHECK: [[VAR4:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; CHECK: add nsw <4 x i32> [[VAR3]], [[VAR4]]
+; CHECK-NOT: shufflevector
 
 for.body:
   %indvars.iv = phi i64 [ 93, %entry ], [ %indvars.iv.next, %for.body ]

diff  --git a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
index a1fb5db8fec0d..1904f89880eb6 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
@@ -16,42 +16,42 @@ define void @VF1-VPlanExe(i32* %dst) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[VEC_IV4:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[VEC_IV5:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[VEC_IV6:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ule i64 [[VEC_IV]], 14
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i64 [[VEC_IV4]], 14
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule i64 [[VEC_IV5]], 14
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i64 [[VEC_IV6]], 14
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ule i64 [[INDUCTION]], 14
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i64 [[INDUCTION1]], 14
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule i64 [[INDUCTION2]], 14
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i64 [[INDUCTION3]], 14
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[INDUCTION]]
+; CHECK-NEXT:    [[SUNK_IND0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[SUNK_IND0]]
 ; CHECK-NEXT:    store i32 0, i32* [[TMP4]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
 ; CHECK:       pred.store.if7:
-; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDUCTION1]]
+; CHECK-NEXT:    [[SUNK_IND1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[SUNK_IND1]]
 ; CHECK-NEXT:    store i32 0, i32* [[TMP5]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE5]]
 ; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
 ; CHECK:       pred.store.if9:
-; CHECK-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDUCTION2]]
+; CHECK-NEXT:    [[SUNK_IND2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[SUNK_IND2]]
 ; CHECK-NEXT:    store i32 0, i32* [[TMP6]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE7]]
 ; CHECK:       pred.store.continue10:
-; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
 ; CHECK:       pred.store.if11:
-; CHECK-NEXT:    [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDUCTION3]]
+; CHECK-NEXT:    [[SUNK_IND3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[SUNK_IND3]]
 ; CHECK-NEXT:    store i32 0, i32* [[TMP7]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE9]]
 ; CHECK:       pred.store.continue12:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16

diff  --git a/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
index f41db0cc262b6..638d094a6ff1a 100644
--- a/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
+++ b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s  -tbaa -basic-aa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s
 ; RUN: opt < %s  -basic-aa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s --check-prefix=CHECK-NOTBAA
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -6,64 +5,27 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; TBAA partitions the accesses in this loop, so it can be vectorized without
 ; runtime checks.
 define i32 @test1(i32* nocapture %a, float* nocapture readonly %b) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[WIDE_LOAD]] to <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4, !tbaa [[TBAA4:![0-9]+]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT:    br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
-; CHECK-NOTBAA-LABEL: @test1(
-; CHECK-NOTBAA-NEXT:  entry:
-; CHECK-NOTBAA-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1600
-; CHECK-NOTBAA-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[B:%.*]], i64 1600
-; CHECK-NOTBAA-NEXT:    [[TMP0:%.*]] = bitcast float* [[SCEVGEP4]] to i32*
-; CHECK-NOTBAA-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[TMP0]], [[A]]
-; CHECK-NOTBAA-NEXT:    [[TMP1:%.*]] = bitcast i32* [[SCEVGEP]] to float*
-; CHECK-NOTBAA-NEXT:    [[BOUND1:%.*]] = icmp ugt float* [[TMP1]], [[B]]
-; CHECK-NOTBAA-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NOTBAA-NEXT:    br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
-; CHECK-NOTBAA:       vector.body:
-; CHECK-NOTBAA-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NOTBAA-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
-; CHECK-NOTBAA-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NOTBAA-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]], !alias.scope !4
-; CHECK-NOTBAA-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[WIDE_LOAD]] to <4 x i32>
-; CHECK-NOTBAA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
-; CHECK-NOTBAA-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NOTBAA-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA7:![0-9]+]], !alias.scope !9, !noalias !4
-; CHECK-NOTBAA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NOTBAA-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NOTBAA-NEXT:    br i1 [[TMP7]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-NOTBAA:       for.body:
-; CHECK-NOTBAA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NOTBAA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NOTBAA-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX]], align 4, !tbaa [[TBAA0]]
-; CHECK-NOTBAA-NEXT:    [[CONV:%.*]] = fptosi float [[TMP8]] to i32
-; CHECK-NOTBAA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NOTBAA-NEXT:    store i32 [[CONV]], i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA7]]
-; CHECK-NOTBAA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NOTBAA-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1600
-; CHECK-NOTBAA-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-NOTBAA:       for.end:
-; CHECK-NOTBAA-NEXT:    ret i32 0
-;
+; CHECK-LABEL: @test1
+; CHECK: entry:
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
 
+; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa
 
+; CHECK: ret i32 0
 
+; CHECK-NOTBAA-LABEL: @test1
+; CHECK-NOTBAA: entry:
+; CHECK-NOTBAA: icmp ugt i32*
+; CHECK-NOTBAA: icmp ugt float*
+; CHECK-NOTBAA-NOT: icmp
+; CHECK-NOTBAA: br i1 {{.+}}, label %for.body, label %vector.body
 
+; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK-NOTBAA: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa
 
+; CHECK-NOTBAA: ret i32 0
 
 entry:
   br label %for.body
@@ -86,98 +48,31 @@ for.end:                                          ; preds = %for.body
 ; This test is like the first, except here there is still one runtime check
 ; required. Without TBAA, however, two checks are required.
 define i32 @test2(i32* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[C:%.*]], i64 1600
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[B:%.*]], i64 1600
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt float* [[SCEVGEP4]], [[C]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4, !tbaa [[TBAA0]], !alias.scope !8
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x float>
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP7]], align 4, !tbaa [[TBAA0]], !alias.scope !11, !noalias !8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT:    br i1 [[TMP8]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* [[ARRAYIDX]], align 4, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP10]] to float
-; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP9]], [[CONV]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[MUL]], float* [[ARRAYIDX4]], align 4, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1600
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
-; CHECK-NOTBAA-LABEL: @test2(
-; CHECK-NOTBAA-NEXT:  entry:
-; CHECK-NOTBAA-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[C:%.*]], i64 1600
-; CHECK-NOTBAA-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[B:%.*]], i64 1600
-; CHECK-NOTBAA-NEXT:    [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1600
-; CHECK-NOTBAA-NEXT:    [[BOUND0:%.*]] = icmp ugt float* [[SCEVGEP4]], [[C]]
-; CHECK-NOTBAA-NEXT:    [[BOUND1:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
-; CHECK-NOTBAA-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NOTBAA-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP7]] to float*
-; CHECK-NOTBAA-NEXT:    [[BOUND09:%.*]] = icmp ugt float* [[TMP0]], [[C]]
-; CHECK-NOTBAA-NEXT:    [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
-; CHECK-NOTBAA-NEXT:    [[BOUND110:%.*]] = icmp ugt i32* [[TMP1]], [[A]]
-; CHECK-NOTBAA-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; CHECK-NOTBAA-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
-; CHECK-NOTBAA-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
-; CHECK-NOTBAA:       vector.body:
-; CHECK-NOTBAA-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NOTBAA-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
-; CHECK-NOTBAA-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NOTBAA-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !tbaa [[TBAA0]], !alias.scope !14
-; CHECK-NOTBAA-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
-; CHECK-NOTBAA-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NOTBAA-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA7]], !alias.scope !17
-; CHECK-NOTBAA-NEXT:    [[TMP6:%.*]] = sitofp <4 x i32> [[WIDE_LOAD12]] to <4 x float>
-; CHECK-NOTBAA-NEXT:    [[TMP7:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[TMP6]]
-; CHECK-NOTBAA-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDEX]]
-; CHECK-NOTBAA-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NOTBAA-NEXT:    store <4 x float> [[TMP7]], <4 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]], !alias.scope !19, !noalias !21
-; CHECK-NOTBAA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NOTBAA-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NOTBAA-NEXT:    br i1 [[TMP10]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-NOTBAA:       for.body:
-; CHECK-NOTBAA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NOTBAA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NOTBAA-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4, !tbaa [[TBAA0]]
-; CHECK-NOTBAA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NOTBAA-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA7]]
-; CHECK-NOTBAA-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP12]] to float
-; CHECK-NOTBAA-NEXT:    [[MUL:%.*]] = fmul float [[TMP11]], [[CONV]]
-; CHECK-NOTBAA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NOTBAA-NEXT:    store float [[MUL]], float* [[ARRAYIDX4]], align 4, !tbaa [[TBAA0]]
-; CHECK-NOTBAA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NOTBAA-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1600
-; CHECK-NOTBAA-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK-NOTBAA:       for.end:
-; CHECK-NOTBAA-NEXT:    ret i32 0
-;
-
-
-
-
-
+; CHECK-LABEL: @test2
+; CHECK: entry:
+; CHECK: icmp ugt float*
+; CHECK: icmp ugt float*
+; CHECK-NOT: icmp uge i32*
+; CHECK: br i1 {{.+}}, label %for.body, label %vector.body
+
+; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa
+
+; CHECK: ret i32 0
+
+; CHECK-NOTBAA-LABEL: @test2
+; CHECK-NOTBAA: entry:
+; CHECK-NOTBAA: icmp ugt float*
+; CHECK-NOTBAA: icmp ugt float*
+; CHECK-NOTBAA-DAG: icmp ugt float*
+; CHECK-NOTBAA-DAG: icmp ugt i32*
+; CHECK-NOTBAA-NOT: icmp
+; CHECK-NOTBAA: br i1 {{.+}}, label %for.body, label %vector.body
+
+; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK-NOTBAA: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa
+
+; CHECK-NOTBAA: ret i32 0
 
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/tripcount.ll b/llvm/test/Transforms/LoopVectorize/tripcount.ll
index 4d1aa7e447f63..e19f4aa85c02c 100644
--- a/llvm/test/Transforms/LoopVectorize/tripcount.ll
+++ b/llvm/test/Transforms/LoopVectorize/tripcount.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; This test verifies that the loop vectorizer will not vectorize low trip count
 ; loops that require runtime checks (Trip count is computed with profile info).
 ; REQUIRES: asserts
@@ -10,22 +9,9 @@ target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
 
 define i32 @foo_low_trip_count1(i32 %bound) {
 ; Simple loop with low tripcount. Should not be vectorized.
+
 ; CHECK-LABEL: @foo_low_trip_count1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], [[BOUND:%.*]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+; CHECK-NOT: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -48,22 +34,9 @@ for.end:                                          ; preds = %for.body
 define i32 @foo_low_trip_count2(i32 %bound) !prof !0 {
 ; The loop has the same invocation count as the function, but has a low
 ; trip count per invocation and is not worth vectorizing.
+
 ; CHECK-LABEL: @foo_low_trip_count2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], [[BOUND:%.*]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+; CHECK-NOT: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -86,52 +59,12 @@ for.end:                                          ; preds = %for.body
 define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
 ; The loop has a low invocation count compared to the function invocation count,
 ; but has a high trip count per invocation. Vectorize it.
+
 ; CHECK-LABEL: @foo_low_trip_count3(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR_PREHEADER:%.*]], label [[FOR_END:%.*]], !prof [[PROF2:![0-9]+]]
-; CHECK:       for.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[BOUND:%.*]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP6]], <4 x i8>* [[TMP7]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF3:![0-9]+]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], [[BOUND]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+; CHECK:  [[VECTOR_BODY:vector\.body]]:
+; CHECK:    br i1 [[TMP9:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP3:\!.*]],
+; CHECK:  [[FOR_BODY:for\.body]]:
+; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP6:\!.*]],
 entry:
   br i1 %cond, label %for.preheader, label %for.end, !prof !2
 
@@ -156,22 +89,9 @@ for.end:                                          ; preds = %for.body
 define i32 @foo_low_trip_count_icmp_sgt(i32 %bound) {
 ; Simple loop with low tripcount and an inequality test for exit.
 ; Should not be vectorized.
+
 ; CHECK-LABEL: @foo_low_trip_count_icmp_sgt(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp sgt i32 [[I_08]], [[BOUND:%.*]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+; CHECK-NOT: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -193,22 +113,9 @@ for.end:                                          ; preds = %for.body
 
 define i32 @const_low_trip_count() {
 ; Simple loop with constant, small trip count and no profiling info.
-; CHECK-LABEL: @const_low_trip_count(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 2
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+
+; CHECK-LABEL: @const_low_trip_count
+; CHECK-NOT: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -230,44 +137,9 @@ for.end:                                          ; preds = %for.body
 
 define i32 @const_large_trip_count() {
 ; Simple loop with constant large trip count and no profiling info.
-; CHECK-LABEL: @const_large_trip_count(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP5]], <4 x i8>* [[TMP6]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1001, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP8]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+
+; CHECK-LABEL: @const_large_trip_count
+; CHECK: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -289,22 +161,9 @@ for.end:                                          ; preds = %for.body
 
 define i32 @const_small_trip_count_step() {
 ; Simple loop with static, small trip count and no profiling info.
-; CHECK-LABEL: @const_small_trip_count_step(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 5
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 10
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+
+; CHECK-LABEL: @const_small_trip_count_step
+; CHECK-NOT: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -326,44 +185,9 @@ for.end:                                          ; preds = %for.body
 
 define i32 @const_trip_over_profile() {
 ; constant trip count takes precedence over profile data
-; CHECK-LABEL: @const_trip_over_profile(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP5]], <4 x i8>* [[TMP6]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1001, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP8]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !prof [[PROF0]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+
+; CHECK-LABEL: @const_trip_over_profile
+; CHECK: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -383,6 +207,8 @@ for.end:                                          ; preds = %for.body
   ret i32 0
 }
 
+; CHECK: [[LP3]] = !{!"branch_weights", i32 10, i32 2490}
+; CHECK: [[LP6]] = !{!"branch_weights", i32 10, i32 0}
 ; original loop has latchExitWeight=10 and backedgeTakenWeight=10,000,
 ; therefore estimatedBackedgeTakenCount=1,000 and estimatedTripCount=1,001.
 ; Vectorizing by 4 produces estimatedTripCounts of 1,001/4=250 and 1,001%4=1
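
For clarity, the arithmetic in the comment above works out as follows (assuming the
usual reading that estimatedTripCount = backedgeTakenWeight / latchExitWeight + 1; the
2490 in the [[LP3]] weights is presumably the resulting vector-loop backedge weight,
and [[LP6]] = {10, 0} is consistent with a scalar remainder of one iteration):

    estimatedBackedgeTakenCount = 10,000 / 10 = 1,000
    estimatedTripCount          = 1,000 + 1   = 1,001
    vectorizing by 4:             1,001 = 4 * 250 + 1   (vector trip ~250, remainder 1)
    vector latch weights {10, 2490}:  2,490 / 10 + 1 = 250 iterations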

diff  --git a/llvm/test/Transforms/LoopVectorize/trunc-reductions.ll b/llvm/test/Transforms/LoopVectorize/trunc-reductions.ll
index 7f5a7123bab8b..92cb0ba8b10ea 100644
--- a/llvm/test/Transforms/LoopVectorize/trunc-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-reductions.ll
@@ -147,24 +147,9 @@ for.end:
 
 define i8 @reduction_smin_trunc(i8* noalias nocapture %ptr) {
 ; CHECK-LABEL: @reduction_smin_trunc(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SUM_02P:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 256, [[ENTRY]] ]
-; CHECK-NEXT:    [[SUM_02:%.*]] = and i32 [[SUM_02P]], 255
-; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[IV]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[GEP]], align 1
-; CHECK-NEXT:    [[EXT:%.*]] = sext i8 [[LOAD]] to i32
-; CHECK-NEXT:    [[TMP1]] = call i32 @llvm.smin.i32(i32 [[SUM_02]], i32 [[EXT]])
-; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[TMP1]] to i8
-; CHECK-NEXT:    ret i8 [[RET]]
-;
+; CHECK-NOT: vector.body
+; CHECK-NOT: <8 x
+; CHECK: ret
 entry:
   br label %for.body
 
@@ -188,24 +173,9 @@ for.end:
 
 define i8 @reduction_umin_trunc(i8* noalias nocapture %ptr) {
 ; CHECK-LABEL: @reduction_umin_trunc(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SUM_02P:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SUM_02:%.*]] = and i32 [[SUM_02P]], 255
-; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[IV]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[GEP]], align 1
-; CHECK-NEXT:    [[EXT:%.*]] = zext i8 [[LOAD]] to i32
-; CHECK-NEXT:    [[TMP1]] = call i32 @llvm.umin.i32(i32 [[SUM_02]], i32 [[EXT]])
-; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[TMP1]] to i8
-; CHECK-NEXT:    ret i8 [[RET]]
-;
+; CHECK-NOT: vector.body
+; CHECK-NOT: <8 x
+; CHECK: ret
 entry:
   br label %for.body
 
@@ -229,24 +199,9 @@ for.end:
 
 define i16 @reduction_smax_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-LABEL: @reduction_smax_trunc(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SUM_02P:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SUM_02:%.*]] = and i32 [[SUM_02P]], 65535
-; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[IV]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i16, i16* [[GEP]], align 2
-; CHECK-NEXT:    [[EXT:%.*]] = sext i16 [[LOAD]] to i32
-; CHECK-NEXT:    [[TMP1]] = call i32 @llvm.smax.i32(i32 [[SUM_02]], i32 [[EXT]])
-; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[TMP1]] to i16
-; CHECK-NEXT:    ret i16 [[RET]]
-;
+; CHECK-NOT: vector.body
+; CHECK-NOT: <8 x
+; CHECK: ret
 entry:
   br label %for.body
 
@@ -270,24 +225,9 @@ for.end:
 
 define i16 @reduction_umax_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-LABEL: @reduction_umax_trunc(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SUM_02P:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SUM_02:%.*]] = and i32 [[SUM_02P]], 65535
-; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[IV]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i16, i16* [[GEP]], align 2
-; CHECK-NEXT:    [[EXT:%.*]] = zext i16 [[LOAD]] to i32
-; CHECK-NEXT:    [[TMP1]] = call i32 @llvm.umax.i32(i32 [[SUM_02]], i32 [[EXT]])
-; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[TMP1]] to i16
-; CHECK-NEXT:    ret i16 [[RET]]
-;
+; CHECK-NOT: vector.body
+; CHECK-NOT: <8 x
+; CHECK: ret
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 87532b728d0cd..91bb543dcd46a 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -1,55 +1,30 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
 
 @dst = external global [32 x i16], align 1
 
 define void @blend_uniform_iv_trunc(i1 %c) {
 ; CHECK-LABEL: @blend_uniform_iv_trunc(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    [[MASK0:%.*]] = insertelement <4 x i1> poison, i1 %c, i32 0
+; CHECK-NEXT:    [[MASK1:%.*]] = shufflevector <4 x i1> [[MASK0]], <4 x i1> poison, <4 x i32> zeroinitializer
+
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 0
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i32 0
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDEX]] to i16
+; CHECK-NEXT:    [[TMP2:%.*]] = add i16 [[TMP1]], 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i16> [[BROADCAST_SPLAT2]], <4 x i16> undef
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[PREDPHI]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i16 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[TMP5]] to <4 x i16>*
-; CHECK-NEXT:    store <4 x i16> zeroinitializer, <4 x i16>* [[TMP6]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i1> [[MASK1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[MASK1]], <4 x i16> [[BROADCAST_SPLAT2]], <4 x i16> undef
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[PREDPHI]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i16 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
+; CHECK-NEXT:    store <4 x i16> zeroinitializer, <4 x i16>* [[TMP7]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 32, 32
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[IV_TRUNC_2:%.*]] = trunc i64 [[IV]] to i16
-; CHECK-NEXT:    br i1 [[C]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]]
-; CHECK:       loop.next:
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[BLEND:%.*]] = phi i16 [ undef, [[LOOP_HEADER]] ], [ [[IV_TRUNC_2]], [[LOOP_NEXT]] ]
-; CHECK-NEXT:    [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i16 [[BLEND]]
-; CHECK-NEXT:    store i16 0, i16* [[DST_PTR]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT:    br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT:    br i1 [[TMP8]], label %middle.block, label %vector.body
 ;
-
 entry:
   br label %loop.header
 
@@ -75,19 +50,17 @@ exit:                                             ; preds = %loop.latch
 
 define void @blend_uniform_iv(i1 %c) {
 ; CHECK-LABEL: @blend_uniform_iv(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    [[MASK0:%.*]] = insertelement <4 x i1> poison, i1 %c, i32 0
+; CHECK-NEXT:    [[MASK1:%.*]] = shufflevector <4 x i1> [[MASK0]], <4 x i1> poison, <4 x i32> zeroinitializer
+
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[BROADCAST_SPLAT2]], <4 x i64> undef
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[MASK1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[MASK1]], <4 x i64> [[BROADCAST_SPLAT2]], <4 x i64> undef
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0
@@ -95,29 +68,8 @@ define void @blend_uniform_iv(i1 %c) {
 ; CHECK-NEXT:    store <4 x i16> zeroinitializer, <4 x i16>* [[TMP5]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 32, 32
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    br i1 [[C]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]]
-; CHECK:       loop.next:
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[BLEND:%.*]] = phi i64 [ undef, [[LOOP_HEADER]] ], [ [[IV]], [[LOOP_NEXT]] ]
-; CHECK-NEXT:    [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[BLEND]]
-; CHECK-NEXT:    store i16 0, i16* [[DST_PTR]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT:    br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br i1 [[TMP6]], label %middle.block, label %vector.body
 ;
-
 entry:
   br label %loop.header
 
@@ -142,65 +94,37 @@ exit:                                             ; preds = %loop.latch
 
 define void @blend_chain_iv(i1 %c) {
 ; CHECK-LABEL: @blend_chain_iv(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    [[MASK0:%.*]] = insertelement <4 x i1> poison, i1 %c, i32 0
+; CHECK-NEXT:    [[MASK1:%.*]] = shufflevector <4 x i1> [[MASK0]], <4 x i1> poison, <4 x i32> zeroinitializer
+
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP0]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> undef
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i1> [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[PREDPHI]], <4 x i64> undef
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 2
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[MASK1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[MASK1]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[MASK1]], <4 x i1> [[MASK1]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> undef
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i1> [[MASK1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <4 x i1> [[TMP8]], <4 x i64> [[PREDPHI]], <4 x i64> undef
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP11]]
-; CHECK-NEXT:    store i16 0, i16* [[TMP6]], align 2
-; CHECK-NEXT:    store i16 0, i16* [[TMP8]], align 2
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 2
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 3
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP15]]
 ; CHECK-NEXT:    store i16 0, i16* [[TMP10]], align 2
 ; CHECK-NEXT:    store i16 0, i16* [[TMP12]], align 2
+; CHECK-NEXT:    store i16 0, i16* [[TMP14]], align 2
+; CHECK-NEXT:    store i16 0, i16* [[TMP16]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 32, 32
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    br i1 [[C]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]]
-; CHECK:       loop.next:
-; CHECK-NEXT:    br i1 [[C]], label [[LOOP_NEXT_2:%.*]], label [[LOOP_NEXT_3:%.*]]
-; CHECK:       loop.next.2:
-; CHECK-NEXT:    br label [[LOOP_NEXT_3]]
-; CHECK:       loop.next.3:
-; CHECK-NEXT:    [[BLEND_1:%.*]] = phi i64 [ undef, [[LOOP_NEXT]] ], [ [[IV]], [[LOOP_NEXT_2]] ]
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[BLEND:%.*]] = phi i64 [ undef, [[LOOP_HEADER]] ], [ [[BLEND_1]], [[LOOP_NEXT_3]] ]
-; CHECK-NEXT:    [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[BLEND]]
-; CHECK-NEXT:    store i16 0, i16* [[DST_PTR]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT:    br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT:    br i1 [[TMP17]], label %middle.block, label %vector.body
 ;
-
 entry:
   br label %loop.header
 

diff  --git a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
index 04cb83c477b90..bbcca51019e64 100644
--- a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
+++ b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
@@ -26,18 +26,18 @@ define void @basic_loop(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <4 x i8>*
+; CHECK-NEXT:    store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP6]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -50,8 +50,8 @@ define void @basic_loop(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
 ; CHECK-NEXT:    [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1
 ; CHECK-NEXT:    [[DEC]] = add nsw i32 [[DEC66]], -1
-; CHECK-NEXT:    [[TMP7:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
-; CHECK-NEXT:    store i8 [[TMP7]], i8* [[BUFF]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    store i8 [[TMP8]], i8* [[BUFF]], align 1
 ; CHECK-NEXT:    [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       end:
@@ -92,18 +92,18 @@ define void @metadata(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <4 x i8>*
+; CHECK-NEXT:    store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP6]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -116,8 +116,8 @@ define void @metadata(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
 ; CHECK-NEXT:    [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1
 ; CHECK-NEXT:    [[DEC]] = add nsw i32 [[DEC66]], -1
-; CHECK-NEXT:    [[TMP7:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
-; CHECK-NEXT:    store i8 [[TMP7]], i8* [[BUFF]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    store i8 [[TMP8]], i8* [[BUFF]], align 1
 ; CHECK-NEXT:    [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       end:

diff  --git a/llvm/test/Transforms/LoopVectorize/vectorize-pointer-phis.ll b/llvm/test/Transforms/LoopVectorize/vectorize-pointer-phis.ll
index 658ffeef0ec54..e3168d5b5d350 100644
--- a/llvm/test/Transforms/LoopVectorize/vectorize-pointer-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-pointer-phis.ll
@@ -1,65 +1,11 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
 
 %s1 = type { [32000 x double], [32000 x double], [32000 x double] }
 
 define i32 @load_with_pointer_phi_no_runtime_checks(%s1* %data) {
-; CHECK-LABEL: @load_with_pointer_phi_no_runtime_checks(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i64> [[VEC_IND]], <i64 15999, i64 15999>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[S1:%.*]], %s1* [[DATA:%.*]], i64 0, i32 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 2, <2 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 1, <2 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x double*> [[TMP4]], <2 x double*> [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double*> [[PREDPHI]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = load double, double* [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double*> [[PREDPHI]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load double, double* [[TMP8]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x double> <double 3.000000e+00, double 3.000000e+00>, [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP12]], <2 x double>* [[TMP14]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
-; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 32000, 32000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 32000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[CMP5:%.*]] = icmp ult i64 [[IV]], 15999
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 0, i64 [[IV]]
-; CHECK-NEXT:    br i1 [[CMP5]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 1, i64 [[IV]]
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       if.else:
-; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 2, i64 [[IV]]
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[GEP_2_SINK:%.*]] = phi double* [ [[GEP_2]], [[IF_ELSE]] ], [ [[GEP_1]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[V8:%.*]] = load double, double* [[GEP_2_SINK]], align 8
-; CHECK-NEXT:    [[MUL16:%.*]] = fmul double 3.000000e+00, [[V8]]
-; CHECK-NEXT:    store double [[MUL16]], double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 32000
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret i32 10
+; CHECK-LABEL: @load_with_pointer_phi_no_runtime_checks
+; CHECK-NOT: memcheck
+; CHECK:     vector.body:
 ;
 entry:
   br label %loop.header
@@ -92,62 +38,9 @@ exit:                                             ; preds = %loop.latch
 }
 
 define i32 @store_with_pointer_phi_no_runtime_checks(%s1* %data) {
-; CHECK-LABEL: @store_with_pointer_phi_no_runtime_checks(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i64> [[VEC_IND]], <i64 15999, i64 15999>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[S1:%.*]], %s1* [[DATA:%.*]], i64 0, i32 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 2, <2 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 1, <2 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x double*> [[TMP4]], <2 x double*> [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> <double 3.000000e+00, double 3.000000e+00>, [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double*> [[PREDPHI]], i32 0
-; CHECK-NEXT:    store double [[TMP9]], double* [[TMP10]], align 8
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double*> [[PREDPHI]], i32 1
-; CHECK-NEXT:    store double [[TMP11]], double* [[TMP12]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 32000, 32000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 32000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[CMP5:%.*]] = icmp ult i64 [[IV]], 15999
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 0, i64 [[IV]]
-; CHECK-NEXT:    br i1 [[CMP5]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 1, i64 [[IV]]
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       if.else:
-; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 2, i64 [[IV]]
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[GEP_2_SINK:%.*]] = phi double* [ [[GEP_2]], [[IF_ELSE]] ], [ [[GEP_1]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[V8:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[MUL16:%.*]] = fmul double 3.000000e+00, [[V8]]
-; CHECK-NEXT:    store double [[MUL16]], double* [[GEP_2_SINK]], align 8
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 32000
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret i32 10
+; CHECK-LABEL: @store_with_pointer_phi_no_runtime_checks
+; CHECK-NOT: memcheck
+; CHECK:     vector.body
 ;
 entry:
   br label %loop.header
@@ -180,84 +73,9 @@ exit:                                             ; preds = %loop.latch
 }
 
 define i32 @store_with_pointer_phi_runtime_checks(double* %A, double* %B, double* %C) {
-; CHECK-LABEL: @store_with_pointer_phi_runtime_checks(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B1:%.*]] = bitcast double* [[B:%.*]] to i8*
-; CHECK-NEXT:    [[C3:%.*]] = bitcast double* [[C:%.*]] to i8*
-; CHECK-NEXT:    [[A6:%.*]] = bitcast double* [[A:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[B]], i64 32000
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr double, double* [[C]], i64 32000
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[A]], i64 32000
-; CHECK-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[C3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[A6]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
-; CHECK-NEXT:    [[BOUND012:%.*]] = icmp ult i8* [[C3]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[BOUND113:%.*]] = icmp ult i8* [[A6]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
-; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
-; CHECK-NEXT:    br i1 [[CONFLICT_RDX15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i64> [[VEC_IND]], <i64 15999, i64 15999>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[C]], <2 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[B]], <2 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x double*> [[TMP4]], <2 x double*> [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 8, !alias.scope !6
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> <double 3.000000e+00, double 3.000000e+00>, [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double*> [[PREDPHI]], i32 0
-; CHECK-NEXT:    store double [[TMP9]], double* [[TMP10]], align 8
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double*> [[PREDPHI]], i32 1
-; CHECK-NEXT:    store double [[TMP11]], double* [[TMP12]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 32000, 32000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 32000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[CMP5:%.*]] = icmp ult i64 [[IV]], 15999
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT:    br i1 [[CMP5]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[IV]]
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       if.else:
-; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds double, double* [[C]], i64 [[IV]]
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[GEP_2_SINK:%.*]] = phi double* [ [[GEP_2]], [[IF_ELSE]] ], [ [[GEP_1]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[V8:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[MUL16:%.*]] = fmul double 3.000000e+00, [[V8]]
-; CHECK-NEXT:    store double [[MUL16]], double* [[GEP_2_SINK]], align 8
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 32000
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret i32 10
+; CHECK-LABEL: @store_with_pointer_phi_runtime_checks
+; CHECK:     memcheck
+; CHECK:     vector.body
 ;
 entry:
   br label %loop.header
@@ -290,28 +108,8 @@ exit:                                             ; preds = %loop.latch
 }
 
 define i32 @load_with_pointer_phi_outside_loop(double* %A, double* %B, double* %C, i1 %c.0, i1 %c.1) {
-; CHECK-LABEL: @load_with_pointer_phi_outside_loop(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[C_0:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    br label [[LOOP_PH:%.*]]
-; CHECK:       if.else:
-; CHECK-NEXT:    [[PTR_SELECT:%.*]] = select i1 [[C_1:%.*]], double* [[C:%.*]], double* [[B:%.*]]
-; CHECK-NEXT:    br label [[LOOP_PH]]
-; CHECK:       loop.ph:
-; CHECK-NEXT:    [[PTR:%.*]] = phi double* [ [[A:%.*]], [[IF_THEN]] ], [ [[PTR_SELECT]], [[IF_ELSE]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[LOOP_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[V8:%.*]] = load double, double* [[PTR]], align 8
-; CHECK-NEXT:    [[MUL16:%.*]] = fmul double 3.000000e+00, [[V8]]
-; CHECK-NEXT:    store double [[MUL16]], double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 32000
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret i32 10
+; CHECK-LABEL: @load_with_pointer_phi_outside_loop
+; CHECK-NOT: vector.body
 ;
 entry:
   br i1 %c.0, label %if.then, label %if.else
@@ -342,28 +140,8 @@ exit:                                             ; preds = %loop.latch
 }
 
 define i32 @store_with_pointer_phi_outside_loop(double* %A, double* %B, double* %C, i1 %c.0, i1 %c.1) {
-; CHECK-LABEL: @store_with_pointer_phi_outside_loop(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[C_0:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    br label [[LOOP_PH:%.*]]
-; CHECK:       if.else:
-; CHECK-NEXT:    [[PTR_SELECT:%.*]] = select i1 [[C_1:%.*]], double* [[C:%.*]], double* [[B:%.*]]
-; CHECK-NEXT:    br label [[LOOP_PH]]
-; CHECK:       loop.ph:
-; CHECK-NEXT:    [[PTR:%.*]] = phi double* [ [[A:%.*]], [[IF_THEN]] ], [ [[PTR_SELECT]], [[IF_ELSE]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[LOOP_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[V8:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[MUL16:%.*]] = fmul double 3.000000e+00, [[V8]]
-; CHECK-NEXT:    store double [[MUL16]], double* [[PTR]], align 8
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 32000
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret i32 10
+; CHECK-LABEL: @store_with_pointer_phi_outside_loop
+; CHECK-NOT: vector.body
 ;
 entry:
   br i1 %c.0, label %if.then, label %if.else

diff  --git a/llvm/test/Transforms/LoopVectorize/vectorizeVFone.ll b/llvm/test/Transforms/LoopVectorize/vectorizeVFone.ll
index eacd2645c1d81..ea86c410731b8 100644
--- a/llvm/test/Transforms/LoopVectorize/vectorizeVFone.ll
+++ b/llvm/test/Transforms/LoopVectorize/vectorizeVFone.ll
@@ -1,29 +1,15 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=loop-vectorize -S 2>&1 | FileCheck %s
 
 %type = type { [3 x double] }
 
 define void @getScalarFunc(double* %A, double* %C, %type* %B) {
-; CHECK-LABEL: @getScalarFunc(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[DUMMYLOAD2:%.*]] = load double, double* [[A:%.*]], align 8
-; CHECK-NEXT:    [[ARRAYIDX_I24:%.*]] = getelementptr inbounds [[TYPE:%.*]], %type* [[B:%.*]], i64 [[I]], i32 0, i32 0
-; CHECK-NEXT:    [[_15:%.*]] = load double, double* [[ARRAYIDX_I24]], align 8
-; CHECK-NEXT:    [[CALL10:%.*]] = tail call fast double @atan(double [[_15]]) #[[ATTR0:[0-9]+]]
-; CHECK-NEXT:    [[INC]] = add i64 [[I]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i64 1000, [[INC]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: getScalarFunc
 ; This check will catch also the massv version of the function.
+; CHECK-NOT: call fast <{{[0-9]+}} x double> @{{.*}}atan(<{{[0-9]+}} x double> %{{[0-9]+}})
 entry:
   br label %for.body
 
-for.body:
+for.body: 
   %i = phi i64 [ %inc, %for.body ], [ 0, %entry ]
   %dummyload2 = load double, double* %A, align 8
   %arrayidx.i24 = getelementptr inbounds %type, %type* %B, i64 %i, i32 0, i32 0
@@ -33,7 +19,7 @@ for.body:
   %cmp = icmp ugt i64 1000, %inc
   br i1 %cmp, label %for.body, label %for.end
 
-for.end:
+for.end: 
   ret void
 }
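
As a concrete illustration of what the reverted CHECK-NOT above guards against
(hypothetical operand names, not taken from any actual vectorizer output), a widened
call such as

  %wide.call = call fast <2 x double> @atan(<2 x double> %5)

would be rejected by the pattern: <{{[0-9]+}} x double> matches <2 x double> and
@{{.*}}atan( matches @atan(, so any vectorized form of the call fails the check.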
 

diff  --git a/llvm/test/Transforms/LoopVectorize/vplan-outer-loop-uncomputable-trip-count.ll b/llvm/test/Transforms/LoopVectorize/vplan-outer-loop-uncomputable-trip-count.ll
index e930ab9880514..817d0b00ffca8 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-outer-loop-uncomputable-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-outer-loop-uncomputable-trip-count.ll
@@ -19,7 +19,7 @@ define void @test() {
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR1_LATCH:%.*]], label [[FOR2_HEADER]]
 ; CHECK:       for1.latch:
 ; CHECK-NEXT:    [[C:%.*]] = call i1 @cond()
-; CHECK-NEXT:    br i1 [[C]], label [[EXIT:%.*]], label [[FOR1_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[C]], label [[EXIT:%.*]], label [[FOR1_HEADER]], !llvm.loop !0
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll
index 0a984b75945e3..4303f14ab6fed 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=4 -enable-vplan-native-path -S %s | FileCheck %s
 
 ; Vectorize explict marked outer loop using vplan native path. Inner loop
@@ -18,69 +17,35 @@
 ; }
 define void @inner_loop_reduction(double* noalias nocapture readonly %a.in, double* noalias nocapture readonly %b.in, double* noalias nocapture %c.out) {
 ; CHECK-LABEL: @inner_loop_reduction(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR1_LATCH4:%.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR1_LATCH4]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds double, double* [[A_IN:%.*]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP0]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, double* [[B_IN:%.*]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP1]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
-; CHECK-NEXT:    br label [[FOR2_HEADER2:%.*]]
-; CHECK:       for2.header2:
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP3:%.*]], [[FOR2_HEADER2]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x double> [ [[WIDE_MASKED_GATHER]], [[VECTOR_BODY]] ], [ [[TMP2:%.*]], [[FOR2_HEADER2]] ]
-; CHECK-NEXT:    [[TMP2]] = fadd <4 x double> [[WIDE_MASKED_GATHER1]], [[VEC_PHI3]]
-; CHECK-NEXT:    [[TMP3]] = add nuw nsw <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP3]], <i32 10000, i32 10000, i32 10000, i32 10000>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[FOR1_LATCH4]], label [[FOR2_HEADER2]]
-; CHECK:       for1.latch4:
-; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi <4 x double> [ [[TMP2]], [[FOR2_HEADER2]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[C_OUT:%.*]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[VEC_PHI5]], <4 x double*> [[TMP6]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP7]], <i64 1000, i64 1000, i64 1000, i64 1000>
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR1_HEADER:%.*]]
-; CHECK:       for1.header:
-; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR11:%.*]], [[FOR1_LATCH:%.*]] ]
-; CHECK-NEXT:    [[A_PTR:%.*]] = getelementptr inbounds double, double* [[A_IN]], i64 [[INDVAR1]]
-; CHECK-NEXT:    [[A:%.*]] = load double, double* [[A_PTR]], align 8
-; CHECK-NEXT:    [[B_PTR:%.*]] = getelementptr inbounds double, double* [[B_IN]], i64 [[INDVAR1]]
-; CHECK-NEXT:    [[B:%.*]] = load double, double* [[B_PTR]], align 8
-; CHECK-NEXT:    br label [[FOR2_HEADER:%.*]]
-; CHECK:       for2.header:
-; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i32 [ 0, [[FOR1_HEADER]] ], [ [[INDVAR21:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT:    [[A_REDUCTION:%.*]] = phi double [ [[A]], [[FOR1_HEADER]] ], [ [[A_REDUCTION1:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT:    [[A_REDUCTION1]] = fadd double [[B]], [[A_REDUCTION]]
-; CHECK-NEXT:    [[INDVAR21]] = add nuw nsw i32 [[INDVAR2]], 1
-; CHECK-NEXT:    [[FOR2_COND:%.*]] = icmp eq i32 [[INDVAR21]], 10000
-; CHECK-NEXT:    br i1 [[FOR2_COND]], label [[FOR1_LATCH]], label [[FOR2_HEADER]]
-; CHECK:       for1.latch:
-; CHECK-NEXT:    [[A_REDUCTION1_LCSSA:%.*]] = phi double [ [[A_REDUCTION1]], [[FOR2_HEADER]] ]
-; CHECK-NEXT:    [[C_PTR:%.*]] = getelementptr inbounds double, double* [[C_OUT]], i64 [[INDVAR1]]
-; CHECK-NEXT:    store double [[A_REDUCTION1_LCSSA]], double* [[C_PTR]], align 8
-; CHECK-NEXT:    [[INDVAR11]] = add nuw nsw i64 [[INDVAR1]], 1
-; CHECK-NEXT:    [[FOR1_COND:%.*]] = icmp eq i64 [[INDVAR11]], 1000
-; CHECK-NEXT:    br i1 [[FOR1_COND]], label [[EXIT]], label [[FOR1_HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
 
+; CHECK: vector.body:
+; CHECK-NEXT: %[[FOR1_INDEX:.*]] = phi i64 [ 0, %[[LABEL_PR:.*]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH:.*]] ]
+; CHECK: %[[VEC_INDEX:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[LABEL_PR]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH]] ]
+; CHECK-NEXT: %[[A_PTR:.*]] = getelementptr inbounds double, double* %a.in, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: %[[MASKED_GATHER1:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %[[A_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, double* %b.in, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: %[[MASKED_GATHER2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %[[B_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK-NEXT: br label %[[FOR2_HEADER:.*]]
 
+; CHECK: [[FOR2_HEADER]]:
+; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ zeroinitializer, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[MASKED_GATHER1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[MASKED_GATHER2]], %[[REDUCTION]]
+; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw <4 x i32> %[[FOR2_INDEX]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i32> %[[FOR2_INDEX_NEXT]], <i32 10000, i32 10000, i32 10000, i32 10000>
+; CHECK-NEXT: %[[EXIT_COND:.*]] = extractelement <4 x i1> %[[VEC_PTR]], i32 0
+; CHECK-NEXT: br i1 %[[EXIT_COND]], label %[[FOR1_LATCH:.*]], label %{{.*}}
 
+; CHECK: [[FOR1_LATCH]]:
+; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, double* %c.out, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %[[REDUCTION]], <4 x double*> %[[C_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: %[[VEC_INDEX_NEXT:.*]] = add nuw nsw <4 x i64> %[[VEC_INDEX]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i64> %[[VEC_INDEX_NEXT]], <i64 1000, i64 1000, i64 1000, i64 1000>
+; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add nuw i64 %[[FOR1_INDEX]], 4
+; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 %[[EXIT_COND]], label %{{.*}}, label %vector.body
 
 entry:
   br label %for1.header

diff  --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll
index 240f78a8319ce..454760935a707 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=4 -enable-vplan-native-path -S %s | FileCheck %s
 
 ; Test that VPlan native path is able to widen call instructions like
@@ -7,71 +6,36 @@
 declare double @llvm.sqrt.f64(double %0)
 define void @widen_call_instruction(double* noalias nocapture readonly %a.in, double* noalias nocapture readonly %b.in, double* noalias nocapture %c.out) {
 ; CHECK-LABEL: @widen_call_instruction(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR1_LATCH4:%.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR1_LATCH4]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds double, double* [[A_IN:%.*]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP0]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, double* [[B_IN:%.*]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP1]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[WIDE_MASKED_GATHER1]])
-; CHECK-NEXT:    br label [[FOR2_HEADER2:%.*]]
-; CHECK:       for2.header2:
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP4:%.*]], [[FOR2_HEADER2]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x double> [ [[WIDE_MASKED_GATHER]], [[VECTOR_BODY]] ], [ [[TMP3:%.*]], [[FOR2_HEADER2]] ]
-; CHECK-NEXT:    [[TMP3]] = fadd <4 x double> [[TMP2]], [[VEC_PHI3]]
-; CHECK-NEXT:    [[TMP4]] = add nuw nsw <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i32> [[TMP4]], <i32 10000, i32 10000, i32 10000, i32 10000>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR1_LATCH4]], label [[FOR2_HEADER2]]
-; CHECK:       for1.latch4:
-; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi <4 x double> [ [[TMP3]], [[FOR2_HEADER2]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[C_OUT:%.*]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[VEC_PHI5]], <4 x double*> [[TMP7]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], <i64 1000, i64 1000, i64 1000, i64 1000>
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR1_HEADER:%.*]]
-; CHECK:       for1.header:
-; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR11:%.*]], [[FOR1_LATCH:%.*]] ]
-; CHECK-NEXT:    [[A_PTR:%.*]] = getelementptr inbounds double, double* [[A_IN]], i64 [[INDVAR1]]
-; CHECK-NEXT:    [[A:%.*]] = load double, double* [[A_PTR]], align 8
-; CHECK-NEXT:    [[B_PTR:%.*]] = getelementptr inbounds double, double* [[B_IN]], i64 [[INDVAR1]]
-; CHECK-NEXT:    [[B:%.*]] = load double, double* [[B_PTR]], align 8
-; CHECK-NEXT:    [[B_SQRT:%.*]] = call double @llvm.sqrt.f64(double [[B]])
-; CHECK-NEXT:    br label [[FOR2_HEADER:%.*]]
-; CHECK:       for2.header:
-; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i32 [ 0, [[FOR1_HEADER]] ], [ [[INDVAR21:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT:    [[A_REDUCTION:%.*]] = phi double [ [[A]], [[FOR1_HEADER]] ], [ [[A_REDUCTION1:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT:    [[A_REDUCTION1]] = fadd double [[B_SQRT]], [[A_REDUCTION]]
-; CHECK-NEXT:    [[INDVAR21]] = add nuw nsw i32 [[INDVAR2]], 1
-; CHECK-NEXT:    [[FOR2_COND:%.*]] = icmp eq i32 [[INDVAR21]], 10000
-; CHECK-NEXT:    br i1 [[FOR2_COND]], label [[FOR1_LATCH]], label [[FOR2_HEADER]]
-; CHECK:       for1.latch:
-; CHECK-NEXT:    [[A_REDUCTION1_LCSSA:%.*]] = phi double [ [[A_REDUCTION1]], [[FOR2_HEADER]] ]
-; CHECK-NEXT:    [[C_PTR:%.*]] = getelementptr inbounds double, double* [[C_OUT]], i64 [[INDVAR1]]
-; CHECK-NEXT:    store double [[A_REDUCTION1_LCSSA]], double* [[C_PTR]], align 8
-; CHECK-NEXT:    [[INDVAR11]] = add nuw nsw i64 [[INDVAR1]], 1
-; CHECK-NEXT:    [[FOR1_COND:%.*]] = icmp eq i64 [[INDVAR11]], 1000
-; CHECK-NEXT:    br i1 [[FOR1_COND]], label [[EXIT]], label [[FOR1_HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
 
+; CHECK: vector.body:
+; CHECK-NEXT: %[[FOR1_INDEX:.*]] = phi i64 [ 0, %[[LABEL_PR:.*]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH:.*]] ]
+; CHECK: %[[VEC_INDEX:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[LABEL_PR]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH]] ]
+; CHECK-NEXT: %[[A_PTR:.*]] = getelementptr inbounds double, double* %a.in, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: %[[MASKED_GATHER1:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %[[A_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, double* %b.in, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: %[[MASKED_GATHER2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %[[B_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK-NEXT: %[[B_SQRT:.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %[[MASKED_GATHER2]])
+; CHECK-NEXT: br label %[[FOR2_HEADER:.*]]
 
+; CHECK: [[FOR2_HEADER]]:
+; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ zeroinitializer, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[MASKED_GATHER1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[B_SQRT]], %[[REDUCTION]]
+; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw <4 x i32> %[[FOR2_INDEX]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i32> %[[FOR2_INDEX_NEXT]], <i32 10000, i32 10000, i32 10000, i32 10000>
+; CHECK-NEXT: %[[EXIT_COND:.*]] = extractelement <4 x i1> %[[VEC_PTR]], i32 0
+; CHECK-NEXT: br i1 %[[EXIT_COND]], label %[[FOR1_LATCH:.*]], label %{{.*}}
 
+; CHECK: [[FOR1_LATCH]]:
+; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, double* %c.out, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %[[REDUCTION]], <4 x double*> %[[C_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: %[[VEC_INDEX_NEXT:.*]] = add nuw nsw <4 x i64> %[[VEC_INDEX]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i64> %[[VEC_INDEX_NEXT]], <i64 1000, i64 1000, i64 1000, i64 1000>
+; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add nuw i64 %[[FOR1_INDEX]], 4
+; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 %[[EXIT_COND]], label %{{.*}}, label %vector.body
 
 entry:
   br label %for1.header

diff  --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll
index 6c1e830147ec1..c8235e53de482 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-vector-width=4 -enable-vplan-native-path -S %s | FileCheck %s
 
 ; Test that VPlan native path is able to widen select instructions in the
@@ -29,41 +28,6 @@ define void @loop_invariant_select(double* noalias nocapture %out, i1 %select, d
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP2:%.*]], [[FOR2_HEADER1]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[SELECT:%.*]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[TMP1]], <4 x double*> [[TMP0]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP2]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], <i64 10000, i64 10000, i64 10000, i64 10000>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
-; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR1_LATCH4]], label [[FOR2_HEADER1]]
-; CHECK:       for1.latch4:
-; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], <i64 1000, i64 1000, i64 1000, i64 1000>
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR1_HEADER:%.*]]
-; CHECK:       for1.header:
-; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR11:%.*]], [[FOR1_LATCH:%.*]] ]
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVAR1]]
-; CHECK-NEXT:    br label [[FOR2_HEADER:%.*]]
-; CHECK:       for2.header:
-; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i64 [ 0, [[FOR1_HEADER]] ], [ [[INDVAR21:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT:    [[SELECT_B:%.*]] = select i1 [[SELECT]], double [[A]], double [[B]]
-; CHECK-NEXT:    store double [[SELECT_B]], double* [[PTR]], align 8
-; CHECK-NEXT:    [[INDVAR21]] = add nuw nsw i64 [[INDVAR2]], 1
-; CHECK-NEXT:    [[FOR2_COND:%.*]] = icmp eq i64 [[INDVAR21]], 10000
-; CHECK-NEXT:    br i1 [[FOR2_COND]], label [[FOR1_LATCH]], label [[FOR2_HEADER]]
-; CHECK:       for1.latch:
-; CHECK-NEXT:    [[INDVAR11]] = add nuw nsw i64 [[INDVAR1]], 1
-; CHECK-NEXT:    [[FOR1_COND:%.*]] = icmp eq i64 [[INDVAR11]], 1000
-; CHECK-NEXT:    br i1 [[FOR1_COND]], label [[EXIT]], label [[FOR1_HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for1.header
 
@@ -110,42 +74,6 @@ define void @outer_loop_dependant_select(double* noalias nocapture %out, double
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[TMP2]], <4 x double*> [[TMP0]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP3]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], <i64 10000, i64 10000, i64 10000, i64 10000>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[FOR1_LATCH4]], label [[FOR2_HEADER1]]
-; CHECK:       for1.latch4:
-; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP6]], <i64 1000, i64 1000, i64 1000, i64 1000>
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR1_HEADER:%.*]]
-; CHECK:       for1.header:
-; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR11:%.*]], [[FOR1_LATCH:%.*]] ]
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVAR1]]
-; CHECK-NEXT:    br label [[FOR2_HEADER:%.*]]
-; CHECK:       for2.header:
-; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i64 [ 0, [[FOR1_HEADER]] ], [ [[INDVAR21:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT:    [[SELECT:%.*]] = trunc i64 [[INDVAR1]] to i1
-; CHECK-NEXT:    [[SELECT_B:%.*]] = select i1 [[SELECT]], double [[A]], double [[B]]
-; CHECK-NEXT:    store double [[SELECT_B]], double* [[PTR]], align 8
-; CHECK-NEXT:    [[INDVAR21]] = add nuw nsw i64 [[INDVAR2]], 1
-; CHECK-NEXT:    [[FOR2_COND:%.*]] = icmp eq i64 [[INDVAR21]], 10000
-; CHECK-NEXT:    br i1 [[FOR2_COND]], label [[FOR1_LATCH]], label [[FOR2_HEADER]]
-; CHECK:       for1.latch:
-; CHECK-NEXT:    [[INDVAR11]] = add nuw nsw i64 [[INDVAR1]], 1
-; CHECK-NEXT:    [[FOR1_COND:%.*]] = icmp eq i64 [[INDVAR11]], 1000
-; CHECK-NEXT:    br i1 [[FOR1_COND]], label [[EXIT]], label [[FOR1_HEADER]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for1.header
 
@@ -193,42 +121,6 @@ define void @inner_loop_dependant_select(double* noalias nocapture %out, double
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[VEC_PHI]] to <4 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[TMP2]], <4 x double*> [[TMP0]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP3]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], <i64 10000, i64 10000, i64 10000, i64 10000>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[FOR1_LATCH4]], label [[FOR2_HEADER1]]
-; CHECK:       for1.latch4:
-; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP6]], <i64 1000, i64 1000, i64 1000, i64 1000>
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR1_HEADER:%.*]]
-; CHECK:       for1.header:
-; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR11:%.*]], [[FOR1_LATCH:%.*]] ]
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVAR1]]
-; CHECK-NEXT:    br label [[FOR2_HEADER:%.*]]
-; CHECK:       for2.header:
-; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i64 [ 0, [[FOR1_HEADER]] ], [ [[INDVAR21:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT:    [[SELECT:%.*]] = trunc i64 [[INDVAR2]] to i1
-; CHECK-NEXT:    [[SELECT_B:%.*]] = select i1 [[SELECT]], double [[A]], double [[B]]
-; CHECK-NEXT:    store double [[SELECT_B]], double* [[PTR]], align 8
-; CHECK-NEXT:    [[INDVAR21]] = add nuw nsw i64 [[INDVAR2]], 1
-; CHECK-NEXT:    [[FOR2_COND:%.*]] = icmp eq i64 [[INDVAR21]], 10000
-; CHECK-NEXT:    br i1 [[FOR2_COND]], label [[FOR1_LATCH]], label [[FOR2_HEADER]]
-; CHECK:       for1.latch:
-; CHECK-NEXT:    [[INDVAR11]] = add nuw nsw i64 [[INDVAR1]], 1
-; CHECK-NEXT:    [[FOR1_COND:%.*]] = icmp eq i64 [[INDVAR11]], 1000
-; CHECK-NEXT:    br i1 [[FOR1_COND]], label [[EXIT]], label [[FOR1_HEADER]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for1.header
 
@@ -277,43 +169,6 @@ define void @outer_and_inner_loop_dependant_select(double* noalias nocapture %ou
 ; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[TMP3]], <4 x double*> [[TMP0]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], <i64 10000, i64 10000, i64 10000, i64 10000>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR1_LATCH4]], label [[FOR2_HEADER1]]
-; CHECK:       for1.latch4:
-; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP7]], <i64 1000, i64 1000, i64 1000, i64 1000>
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR1_HEADER:%.*]]
-; CHECK:       for1.header:
-; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR11:%.*]], [[FOR1_LATCH:%.*]] ]
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVAR1]]
-; CHECK-NEXT:    br label [[FOR2_HEADER:%.*]]
-; CHECK:       for2.header:
-; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i64 [ 0, [[FOR1_HEADER]] ], [ [[INDVAR21:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT:    [[SUM:%.*]] = add nuw nsw i64 [[INDVAR1]], [[INDVAR2]]
-; CHECK-NEXT:    [[SELECT:%.*]] = trunc i64 [[SUM]] to i1
-; CHECK-NEXT:    [[SELECT_B:%.*]] = select i1 [[SELECT]], double [[A]], double [[B]]
-; CHECK-NEXT:    store double [[SELECT_B]], double* [[PTR]], align 8
-; CHECK-NEXT:    [[INDVAR21]] = add nuw nsw i64 [[INDVAR2]], 1
-; CHECK-NEXT:    [[FOR2_COND:%.*]] = icmp eq i64 [[INDVAR21]], 10000
-; CHECK-NEXT:    br i1 [[FOR2_COND]], label [[FOR1_LATCH]], label [[FOR2_HEADER]]
-; CHECK:       for1.latch:
-; CHECK-NEXT:    [[INDVAR11]] = add nuw nsw i64 [[INDVAR1]], 1
-; CHECK-NEXT:    [[FOR1_COND:%.*]] = icmp eq i64 [[INDVAR11]], 1000
-; CHECK-NEXT:    br i1 [[FOR1_COND]], label [[EXIT]], label [[FOR1_HEADER]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for1.header
 

diff  --git a/llvm/test/Transforms/SROA/address-spaces.ll b/llvm/test/Transforms/SROA/address-spaces.ll
index e84d6120d69d3..4303a924595a5 100644
--- a/llvm/test/Transforms/SROA/address-spaces.ll
+++ b/llvm/test/Transforms/SROA/address-spaces.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=sroa -S | FileCheck %s
 target datalayout = "e-p:64:64:64-p1:16:16:16-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
 
@@ -11,11 +10,9 @@ declare void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)* nocapture, i8 addrspace
 ; Make sure an illegal bitcast isn't introduced
 define void @test_address_space_1_1(<2 x i64> addrspace(1)* %a, i16 addrspace(1)* %b) {
 ; CHECK-LABEL: @test_address_space_1_1(
-; CHECK-NEXT:    [[AA_0_COPYLOAD:%.*]] = load <2 x i64>, <2 x i64> addrspace(1)* [[A:%.*]], align 2
-; CHECK-NEXT:    [[AA_0_BPTR_SROA_CAST:%.*]] = bitcast i16 addrspace(1)* [[B:%.*]] to <2 x i64> addrspace(1)*
-; CHECK-NEXT:    store <2 x i64> [[AA_0_COPYLOAD]], <2 x i64> addrspace(1)* [[AA_0_BPTR_SROA_CAST]], align 2
-; CHECK-NEXT:    ret void
-;
+; CHECK: load <2 x i64>, <2 x i64> addrspace(1)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(1)* {{.*}}, align 2
+; CHECK: ret void
   %aa = alloca <2 x i64>, align 16
   %aptr = bitcast <2 x i64> addrspace(1)* %a to i8 addrspace(1)*
   %aaptr = bitcast <2 x i64>* %aa to i8*
@@ -27,11 +24,9 @@ define void @test_address_space_1_1(<2 x i64> addrspace(1)* %a, i16 addrspace(1)
 
 define void @test_address_space_1_0(<2 x i64> addrspace(1)* %a, i16* %b) {
 ; CHECK-LABEL: @test_address_space_1_0(
-; CHECK-NEXT:    [[AA_0_COPYLOAD:%.*]] = load <2 x i64>, <2 x i64> addrspace(1)* [[A:%.*]], align 2
-; CHECK-NEXT:    [[AA_0_BPTR_SROA_CAST:%.*]] = bitcast i16* [[B:%.*]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[AA_0_COPYLOAD]], <2 x i64>* [[AA_0_BPTR_SROA_CAST]], align 2
-; CHECK-NEXT:    ret void
-;
+; CHECK: load <2 x i64>, <2 x i64> addrspace(1)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64>* {{.*}}, align 2
+; CHECK: ret void
   %aa = alloca <2 x i64>, align 16
   %aptr = bitcast <2 x i64> addrspace(1)* %a to i8 addrspace(1)*
   %aaptr = bitcast <2 x i64>* %aa to i8*
@@ -43,11 +38,9 @@ define void @test_address_space_1_0(<2 x i64> addrspace(1)* %a, i16* %b) {
 
 define void @test_address_space_0_1(<2 x i64>* %a, i16 addrspace(1)* %b) {
 ; CHECK-LABEL: @test_address_space_0_1(
-; CHECK-NEXT:    [[AA_0_COPYLOAD:%.*]] = load <2 x i64>, <2 x i64>* [[A:%.*]], align 2
-; CHECK-NEXT:    [[AA_0_BPTR_SROA_CAST:%.*]] = bitcast i16 addrspace(1)* [[B:%.*]] to <2 x i64> addrspace(1)*
-; CHECK-NEXT:    store <2 x i64> [[AA_0_COPYLOAD]], <2 x i64> addrspace(1)* [[AA_0_BPTR_SROA_CAST]], align 2
-; CHECK-NEXT:    ret void
-;
+; CHECK: load <2 x i64>, <2 x i64>* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(1)* {{.*}}, align 2
+; CHECK: ret void
   %aa = alloca <2 x i64>, align 16
   %aptr = bitcast <2 x i64>* %a to i8*
   %aaptr = bitcast <2 x i64>* %aa to i8*
@@ -62,18 +55,7 @@ define void @test_address_space_0_1(<2 x i64>* %a, i16 addrspace(1)* %b) {
 ; Function Attrs: nounwind
 define void @copy_struct([5 x i64] %in.coerce) {
 ; CHECK-LABEL: @copy_struct(
-; CHECK-NEXT:  for.end:
-; CHECK-NEXT:    [[IN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [5 x i64] [[IN_COERCE:%.*]], 0
-; CHECK-NEXT:    [[IN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [5 x i64] [[IN_COERCE]], 1
-; CHECK-NEXT:    [[IN_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [5 x i64] [[IN_COERCE]], 2
-; CHECK-NEXT:    [[IN_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [5 x i64] [[IN_COERCE]], 3
-; CHECK-NEXT:    [[IN_SROA_2_4_EXTRACT_SHIFT:%.*]] = lshr i64 [[IN_COERCE_FCA_2_EXTRACT]], 32
-; CHECK-NEXT:    [[IN_SROA_2_4_EXTRACT_TRUNC:%.*]] = trunc i64 [[IN_SROA_2_4_EXTRACT_SHIFT]] to i32
-; CHECK-NEXT:    store i32 [[IN_SROA_2_4_EXTRACT_TRUNC]], i32 addrspace(1)* undef, align 4
-; CHECK-NEXT:    store i64 [[IN_COERCE_FCA_3_EXTRACT]], i64 addrspace(1)* poison, align 4
-; CHECK-NEXT:    store i32 undef, i32 addrspace(1)* poison, align 4
-; CHECK-NEXT:    ret void
-;
+; CHECK-NOT: memcpy
 for.end:
   %in = alloca %struct.struct_test_27.0.13, align 8
   %0 = bitcast %struct.struct_test_27.0.13* %in to [5 x i64]*
@@ -83,7 +65,7 @@ for.end:
   call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* align 4 undef, i8* align 4 %scevgep910, i32 16, i1 false)
   ret void
 }
-
+ 
 %union.anon = type { i32* }
 
 @g = common global i32 0, align 4
@@ -93,12 +75,8 @@ for.end:
 ; illegal bitcast isn't introduced
 define void @pr27557() {
 ; CHECK-LABEL: @pr27557(
-; CHECK-NEXT:    [[DOTSROA_0:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    store i32* @g, i32** [[DOTSROA_0]], align 8
-; CHECK-NEXT:    [[DOTSROA_0_0__SROA_CAST1:%.*]] = bitcast i32** [[DOTSROA_0]] to i32 addrspace(3)**
-; CHECK-NEXT:    store i32 addrspace(3)* @l, i32 addrspace(3)** [[DOTSROA_0_0__SROA_CAST1]], align 8
-; CHECK-NEXT:    ret void
-;
+; CHECK: %[[CAST:.*]] = bitcast i32** {{.*}} to i32 addrspace(3)**
+; CHECK: store i32 addrspace(3)* @l, i32 addrspace(3)** %[[CAST]]
   %1 = alloca %union.anon, align 8
   %2 = bitcast %union.anon* %1 to i32**
   store i32* @g, i32** %2, align 8
@@ -113,8 +91,7 @@ define void @pr27557() {
 ; should be promoted through the pair of `ptrtoint`/`inttoptr`.
 define i32* @pr27557.alt() {
 ; CHECK-LABEL: @pr27557.alt(
-; CHECK-NEXT:    ret i32* inttoptr (i64 ptrtoint (i32 addrspace(2)* @l2 to i64) to i32*)
-;
+; CHECK: ret i32* inttoptr (i64 ptrtoint (i32 addrspace(2)* @l2 to i64) to i32*)
   %1 = alloca %union.anon, align 8
   %2 = bitcast %union.anon* %1 to i32 addrspace(2)**
   store i32 addrspace(2)* @l2, i32 addrspace(2)** %2, align 8
@@ -125,52 +102,30 @@ define i32* @pr27557.alt() {
 
 ; Make sure pre-splitting doesn't try to introduce an illegal bitcast
 define float @presplit(i64 addrspace(1)* %p) {
-; CHECK-LABEL: @presplit(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P_SROA_CAST:%.*]] = bitcast i64 addrspace(1)* [[P:%.*]] to i32 addrspace(1)*
-; CHECK-NEXT:    [[L1:%.*]] = load i32, i32 addrspace(1)* [[P_SROA_CAST]], align 4
-; CHECK-NEXT:    [[P_SROA_RAW_CAST:%.*]] = bitcast i64 addrspace(1)* [[P]] to i8 addrspace(1)*
-; CHECK-NEXT:    [[P_SROA_RAW_IDX:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[P_SROA_RAW_CAST]], i16 4
-; CHECK-NEXT:    [[P_SROA_CAST2:%.*]] = bitcast i8 addrspace(1)* [[P_SROA_RAW_IDX]] to i32 addrspace(1)*
-; CHECK-NEXT:    [[L3:%.*]] = load i32, i32 addrspace(1)* [[P_SROA_CAST2]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32 [[L1]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[L3]] to float
-; CHECK-NEXT:    [[RET:%.*]] = fadd float [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    ret float [[RET]]
-;
 entry:
-  %b = alloca i64
-  %b.cast = bitcast i64* %b to [2 x float]*
-  %b.gep1 = getelementptr [2 x float], [2 x float]* %b.cast, i32 0, i32 0
-  %b.gep2 = getelementptr [2 x float], [2 x float]* %b.cast, i32 0, i32 1
-  %l = load i64, i64 addrspace(1)* %p
-  store i64 %l, i64* %b
-  %f1 = load float, float* %b.gep1
-  %f2 = load float, float* %b.gep2
-  %ret = fadd float %f1, %f2
-  ret float %ret
+; CHECK-LABEL: @presplit(
+; CHECK: %[[CAST:.*]] = bitcast i64 addrspace(1)* {{.*}} to i32 addrspace(1)*
+; CHECK: load i32, i32 addrspace(1)* %[[CAST]]
+   %b = alloca i64
+   %b.cast = bitcast i64* %b to [2 x float]*
+   %b.gep1 = getelementptr [2 x float], [2 x float]* %b.cast, i32 0, i32 0
+   %b.gep2 = getelementptr [2 x float], [2 x float]* %b.cast, i32 0, i32 1
+   %l = load i64, i64 addrspace(1)* %p
+   store i64 %l, i64* %b
+   %f1 = load float, float* %b.gep1
+   %f2 = load float, float* %b.gep2
+   %ret = fadd float %f1, %f2
+   ret float %ret
 }
 
 ; Test load from and store to non-zero address space.
 define void @test_load_store_diff_addr_space([2 x float] addrspace(1)* %complex1, [2 x float] addrspace(1)* %complex2) {
-; CHECK-LABEL: @test_load_store_diff_addr_space(
-; CHECK-NEXT:    [[P1_SROA_CAST:%.*]] = bitcast [2 x float] addrspace(1)* [[COMPLEX1:%.*]] to i32 addrspace(1)*
-; CHECK-NEXT:    [[V15:%.*]] = load i32, i32 addrspace(1)* [[P1_SROA_CAST]], align 4
-; CHECK-NEXT:    [[P1_SROA_IDX:%.*]] = getelementptr inbounds [2 x float], [2 x float] addrspace(1)* [[COMPLEX1]], i16 0, i16 1
-; CHECK-NEXT:    [[P1_SROA_CAST7:%.*]] = bitcast float addrspace(1)* [[P1_SROA_IDX]] to i32 addrspace(1)*
-; CHECK-NEXT:    [[V18:%.*]] = load i32, i32 addrspace(1)* [[P1_SROA_CAST7]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[V15]] to float
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[V18]] to float
-; CHECK-NEXT:    [[SUM:%.*]] = fadd float [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[SUM]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[SUM]] to i32
-; CHECK-NEXT:    [[P2_SROA_CAST:%.*]] = bitcast [2 x float] addrspace(1)* [[COMPLEX2:%.*]] to i32 addrspace(1)*
-; CHECK-NEXT:    store i32 [[TMP3]], i32 addrspace(1)* [[P2_SROA_CAST]], align 4
-; CHECK-NEXT:    [[P2_SROA_IDX:%.*]] = getelementptr inbounds [2 x float], [2 x float] addrspace(1)* [[COMPLEX2]], i16 0, i16 1
-; CHECK-NEXT:    [[P2_SROA_CAST4:%.*]] = bitcast float addrspace(1)* [[P2_SROA_IDX]] to i32 addrspace(1)*
-; CHECK-NEXT:    store i32 [[TMP4]], i32 addrspace(1)* [[P2_SROA_CAST4]], align 4
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @test_load_store_diff_addr_space
+; CHECK-NOT: alloca
+; CHECK: load i32, i32 addrspace(1)*
+; CHECK: load i32, i32 addrspace(1)*
+; CHECK: store i32 %{{.*}}, i32 addrspace(1)*
+; CHECK: store i32 %{{.*}}, i32 addrspace(1)*
   %a = alloca i64
   %a.cast = bitcast i64* %a to [2 x float]*
   %a.gep1 = getelementptr [2 x float], [2 x float]* %a.cast, i32 0, i32 0

diff  --git a/llvm/test/Transforms/SROA/alignment.ll b/llvm/test/Transforms/SROA/alignment.ll
index de794852ac5f9..0fe9c849e964d 100644
--- a/llvm/test/Transforms/SROA/alignment.ll
+++ b/llvm/test/Transforms/SROA/alignment.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=sroa -S | FileCheck %s
 ; RUN: opt -passes='debugify,function(sroa)' -S < %s | FileCheck %s -check-prefix DEBUGLOC
 
@@ -8,33 +7,15 @@ declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i1)
 
 define void @test1({ i8, i8 }* %a, { i8, i8 }* %b) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA_SROA_0_0_GEP_A_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[A:%.*]], i64 0, i32 0
-; CHECK-NEXT:    [[ALLOCA_SROA_0_0_COPYLOAD:%.*]] = load i8, i8* [[ALLOCA_SROA_0_0_GEP_A_SROA_IDX]], align 16
-; CHECK-NEXT:    [[ALLOCA_SROA_3_0_GEP_A_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[A]], i64 0, i32 1
-; CHECK-NEXT:    [[ALLOCA_SROA_3_0_COPYLOAD:%.*]] = load i8, i8* [[ALLOCA_SROA_3_0_GEP_A_SROA_IDX]], align 1
-; CHECK-NEXT:    [[ALLOCA_SROA_0_0_GEP_B_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[B:%.*]], i64 0, i32 0
-; CHECK-NEXT:    store i8 [[ALLOCA_SROA_0_0_COPYLOAD]], i8* [[ALLOCA_SROA_0_0_GEP_B_SROA_IDX]], align 16
-; CHECK-NEXT:    [[ALLOCA_SROA_3_0_GEP_B_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[B]], i64 0, i32 1
-; CHECK-NEXT:    store i8 [[ALLOCA_SROA_3_0_COPYLOAD]], i8* [[ALLOCA_SROA_3_0_GEP_B_SROA_IDX]], align 1
-; CHECK-NEXT:    ret void
-;
-; DEBUGLOC-LABEL: @test1(
-; DEBUGLOC-NEXT:  entry:
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata { i8, i8 }* undef, metadata [[META9:![0-9]+]], metadata !DIExpression()), !dbg [[DBG14:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG15:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META12:![0-9]+]], metadata !DIExpression()), !dbg [[DBG16:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG17:![0-9]+]]
-; DEBUGLOC-NEXT:    [[ALLOCA_SROA_0_0_GEP_A_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[A:%.*]], i64 0, i32 0, !dbg [[DBG18:![0-9]+]]
-; DEBUGLOC-NEXT:    [[ALLOCA_SROA_0_0_COPYLOAD:%.*]] = load i8, i8* [[ALLOCA_SROA_0_0_GEP_A_SROA_IDX]], align 16, !dbg [[DBG18]]
-; DEBUGLOC-NEXT:    [[ALLOCA_SROA_3_0_GEP_A_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[A]], i64 0, i32 1, !dbg [[DBG18]]
-; DEBUGLOC-NEXT:    [[ALLOCA_SROA_3_0_COPYLOAD:%.*]] = load i8, i8* [[ALLOCA_SROA_3_0_GEP_A_SROA_IDX]], align 1, !dbg [[DBG18]]
-; DEBUGLOC-NEXT:    [[ALLOCA_SROA_0_0_GEP_B_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[B:%.*]], i64 0, i32 0, !dbg [[DBG19:![0-9]+]]
-; DEBUGLOC-NEXT:    store i8 [[ALLOCA_SROA_0_0_COPYLOAD]], i8* [[ALLOCA_SROA_0_0_GEP_B_SROA_IDX]], align 16, !dbg [[DBG19]]
-; DEBUGLOC-NEXT:    [[ALLOCA_SROA_3_0_GEP_B_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[B]], i64 0, i32 1, !dbg [[DBG19]]
-; DEBUGLOC-NEXT:    store i8 [[ALLOCA_SROA_3_0_COPYLOAD]], i8* [[ALLOCA_SROA_3_0_GEP_B_SROA_IDX]], align 1, !dbg [[DBG19]]
-; DEBUGLOC-NEXT:    ret void, !dbg [[DBG20:![0-9]+]]
-;
+; CHECK: %[[gep_a0:.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* %a, i64 0, i32 0
+; CHECK: %[[a0:.*]] = load i8, i8* %[[gep_a0]], align 16
+; CHECK: %[[gep_a1:.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* %a, i64 0, i32 1
+; CHECK: %[[a1:.*]] = load i8, i8* %[[gep_a1]], align 1
+; CHECK: %[[gep_b0:.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* %b, i64 0, i32 0
+; CHECK: store i8 %[[a0]], i8* %[[gep_b0]], align 16
+; CHECK: %[[gep_b1:.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* %b, i64 0, i32 1
+; CHECK: store i8 %[[a1]], i8* %[[gep_b1]], align 1
+; CHECK: ret void
 
 entry:
   %alloca = alloca { i8, i8 }, align 16
@@ -51,37 +32,18 @@ entry:
 
 define void @test2() {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i16, align 2
-; CHECK-NEXT:    store volatile i16 0, i16* [[A_SROA_0]], align 2
-; CHECK-NEXT:    [[A_SROA_0_1_GEP2_SROA_RAW_CAST:%.*]] = bitcast i16* [[A_SROA_0]] to i8*
-; CHECK-NEXT:    [[A_SROA_0_1_GEP2_SROA_RAW_IDX:%.*]] = getelementptr inbounds i8, i8* [[A_SROA_0_1_GEP2_SROA_RAW_CAST]], i64 1
-; CHECK-NEXT:    [[A_SROA_0_1_A_SROA_0_2_RESULT:%.*]] = load i8, i8* [[A_SROA_0_1_GEP2_SROA_RAW_IDX]], align 1
-; CHECK-NEXT:    [[A_SROA_0_1_GEP2_SROA_RAW_CAST3:%.*]] = bitcast i16* [[A_SROA_0]] to i8*
-; CHECK-NEXT:    [[A_SROA_0_1_GEP2_SROA_RAW_IDX4:%.*]] = getelementptr inbounds i8, i8* [[A_SROA_0_1_GEP2_SROA_RAW_CAST3]], i64 1
-; CHECK-NEXT:    store i8 42, i8* [[A_SROA_0_1_GEP2_SROA_RAW_IDX4]], align 1
-; CHECK-NEXT:    ret void
-;
-; DEBUGLOC-LABEL: @test2(
-; DEBUGLOC-NEXT:  entry:
-; DEBUGLOC-NEXT:    [[A_SROA_0:%.*]] = alloca i16, align 2, !dbg [[DBG29:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata { i8, i8, i8, i8 }* undef, metadata [[META23:![0-9]+]], metadata !DIExpression()), !dbg [[DBG29]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META24:![0-9]+]], metadata !DIExpression()), !dbg [[DBG30:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i16* undef, metadata [[META25:![0-9]+]], metadata !DIExpression()), !dbg [[DBG31:![0-9]+]]
-; DEBUGLOC-NEXT:    store volatile i16 0, i16* [[A_SROA_0]], align 2, !dbg [[DBG32:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META26:![0-9]+]], metadata !DIExpression()), !dbg [[DBG33:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_1_GEP2_SROA_RAW_CAST:%.*]] = bitcast i16* [[A_SROA_0]] to i8*, !dbg [[DBG34:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_1_GEP2_SROA_RAW_IDX:%.*]] = getelementptr inbounds i8, i8* [[A_SROA_0_1_GEP2_SROA_RAW_CAST]], i64 1, !dbg [[DBG34]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_1_A_SROA_0_2_RESULT:%.*]] = load i8, i8* [[A_SROA_0_1_GEP2_SROA_RAW_IDX]], align 1, !dbg [[DBG34]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8 [[A_SROA_0_1_A_SROA_0_2_RESULT]], metadata [[META27:![0-9]+]], metadata !DIExpression()), !dbg [[DBG34]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_1_GEP2_SROA_RAW_CAST3:%.*]] = bitcast i16* [[A_SROA_0]] to i8*, !dbg [[DBG35:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_1_GEP2_SROA_RAW_IDX4:%.*]] = getelementptr inbounds i8, i8* [[A_SROA_0_1_GEP2_SROA_RAW_CAST3]], i64 1, !dbg [[DBG35]]
-; DEBUGLOC-NEXT:    store i8 42, i8* [[A_SROA_0_1_GEP2_SROA_RAW_IDX4]], align 1, !dbg [[DBG35]]
-; DEBUGLOC-NEXT:    ret void, !dbg [[DBG36:![0-9]+]]
-;
+; CHECK: alloca i16
+; CHECK: load i8, i8* %{{.*}}
+; CHECK: store i8 42, i8* %{{.*}}
+; CHECK: ret void
 
 ; Check that when sroa rewrites the alloca partition
 ; it preserves the original DebugLocation.
+; DEBUGLOC-LABEL: @test2(
+; DEBUGLOC: {{.*}} = alloca {{.*}} !dbg ![[DbgLoc:[0-9]+]]
+; DEBUGLOC-LABEL: }
+;
+; DEBUGLOC: ![[DbgLoc]] = !DILocation(line: 9,
 
 entry:
   %a = alloca { i8, i8, i8, i8 }, align 2      ; "line 9" to -debugify
@@ -97,23 +59,9 @@ entry:
 define void @PR13920(<2 x i64>* %a, i16* %b) {
 ; Test that alignments on memcpy intrinsics get propagated to loads and stores.
 ; CHECK-LABEL: @PR13920(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[AA_0_COPYLOAD:%.*]] = load <2 x i64>, <2 x i64>* [[A:%.*]], align 2
-; CHECK-NEXT:    [[AA_0_BPTR_SROA_CAST:%.*]] = bitcast i16* [[B:%.*]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[AA_0_COPYLOAD]], <2 x i64>* [[AA_0_BPTR_SROA_CAST]], align 2
-; CHECK-NEXT:    ret void
-;
-; DEBUGLOC-LABEL: @PR13920(
-; DEBUGLOC-NEXT:  entry:
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata <2 x i64>* undef, metadata [[META39:![0-9]+]], metadata !DIExpression()), !dbg [[DBG43:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META40:![0-9]+]], metadata !DIExpression()), !dbg [[DBG44:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META41:![0-9]+]], metadata !DIExpression()), !dbg [[DBG45:![0-9]+]]
-; DEBUGLOC-NEXT:    [[AA_0_COPYLOAD:%.*]] = load <2 x i64>, <2 x i64>* [[A:%.*]], align 2, !dbg [[DBG46:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META42:![0-9]+]], metadata !DIExpression()), !dbg [[DBG47:![0-9]+]]
-; DEBUGLOC-NEXT:    [[AA_0_BPTR_SROA_CAST:%.*]] = bitcast i16* [[B:%.*]] to <2 x i64>*, !dbg [[DBG48:![0-9]+]]
-; DEBUGLOC-NEXT:    store <2 x i64> [[AA_0_COPYLOAD]], <2 x i64>* [[AA_0_BPTR_SROA_CAST]], align 2, !dbg [[DBG48]]
-; DEBUGLOC-NEXT:    ret void, !dbg [[DBG49:![0-9]+]]
-;
+; CHECK: load <2 x i64>, <2 x i64>* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64>* {{.*}}, align 2
+; CHECK: ret void
 
 entry:
   %aa = alloca <2 x i64>, align 16
@@ -131,30 +79,9 @@ define void @test3(i8* %x) {
 ; expecting. However, also check that any offset within an alloca can in turn
 ; reduce the alignment.
 ; CHECK-LABEL: @test3(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca [22 x i8], align 8
-; CHECK-NEXT:    [[B_SROA_0:%.*]] = alloca [18 x i8], align 2
-; CHECK-NEXT:    [[A_SROA_0_0_A_RAW_SROA_IDX:%.*]] = getelementptr inbounds [22 x i8], [22 x i8]* [[A_SROA_0]], i64 0, i64 0
-; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[A_SROA_0_0_A_RAW_SROA_IDX]], i8* align 8 [[X:%.*]], i32 22, i1 false)
-; CHECK-NEXT:    [[B_SROA_0_6_B_GEP_SROA_IDX:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* [[B_SROA_0]], i64 0, i64 0
-; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[B_SROA_0_6_B_GEP_SROA_IDX]], i8* align 2 [[X]], i32 18, i1 false)
-; CHECK-NEXT:    ret void
-;
-; DEBUGLOC-LABEL: @test3(
-; DEBUGLOC-NEXT:  entry:
-; DEBUGLOC-NEXT:    [[A_SROA_0:%.*]] = alloca [22 x i8], align 8, !dbg [[DBG57:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata { i8*, i8*, i8* }* undef, metadata [[META52:![0-9]+]], metadata !DIExpression()), !dbg [[DBG57]]
-; DEBUGLOC-NEXT:    [[B_SROA_0:%.*]] = alloca [18 x i8], align 2, !dbg [[DBG58:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata { i8*, i8*, i8* }* undef, metadata [[META53:![0-9]+]], metadata !DIExpression()), !dbg [[DBG58]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META54:![0-9]+]], metadata !DIExpression()), !dbg [[DBG59:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_0_A_RAW_SROA_IDX:%.*]] = getelementptr inbounds [22 x i8], [22 x i8]* [[A_SROA_0]], i64 0, i64 0, !dbg [[DBG60:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[A_SROA_0_0_A_RAW_SROA_IDX]], i8* align 8 [[X:%.*]], i32 22, i1 false), !dbg [[DBG60]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META55:![0-9]+]], metadata !DIExpression()), !dbg [[DBG61:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META56:![0-9]+]], metadata !DIExpression()), !dbg [[DBG62:![0-9]+]]
-; DEBUGLOC-NEXT:    [[B_SROA_0_6_B_GEP_SROA_IDX:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* [[B_SROA_0]], i64 0, i64 0, !dbg [[DBG63:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[B_SROA_0_6_B_GEP_SROA_IDX]], i8* align 2 [[X]], i32 18, i1 false), !dbg [[DBG63]]
-; DEBUGLOC-NEXT:    ret void, !dbg [[DBG64:![0-9]+]]
-;
+; CHECK: alloca [22 x i8], align 8
+; CHECK: alloca [18 x i8], align 2
+; CHECK: ret void
 
 entry:
   %a = alloca { i8*, i8*, i8* }
@@ -173,53 +100,14 @@ define void @test5() {
 ; split or promoted out of existence.
 ;
 ; CHECK-LABEL: @test5(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    [[A_SROA_3:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    [[A_SROA_0_0_PTR1_SROA_CAST2:%.*]] = bitcast [9 x i8]* [[A_SROA_0]] to double*
-; CHECK-NEXT:    store volatile double 0.000000e+00, double* [[A_SROA_0_0_PTR1_SROA_CAST2]], align 1
-; CHECK-NEXT:    [[A_SROA_0_7_WEIRD_CAST1_SROA_IDX4:%.*]] = getelementptr inbounds [9 x i8], [9 x i8]* [[A_SROA_0]], i64 0, i64 7
-; CHECK-NEXT:    [[A_SROA_0_7_WEIRD_CAST1_SROA_CAST5:%.*]] = bitcast i8* [[A_SROA_0_7_WEIRD_CAST1_SROA_IDX4]] to i16*
-; CHECK-NEXT:    [[A_SROA_0_7_A_SROA_0_7_WEIRD_LOAD1:%.*]] = load volatile i16, i16* [[A_SROA_0_7_WEIRD_CAST1_SROA_CAST5]], align 1
-; CHECK-NEXT:    [[A_SROA_0_0_PTR1_SROA_CAST3:%.*]] = bitcast [9 x i8]* [[A_SROA_0]] to double*
-; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_D1:%.*]] = load double, double* [[A_SROA_0_0_PTR1_SROA_CAST3]], align 1
-; CHECK-NEXT:    [[A_SROA_3_0_PTR2_SROA_CAST:%.*]] = bitcast [9 x i8]* [[A_SROA_3]] to double*
-; CHECK-NEXT:    store volatile double [[A_SROA_0_0_A_SROA_0_0_D1]], double* [[A_SROA_3_0_PTR2_SROA_CAST]], align 1
-; CHECK-NEXT:    [[A_SROA_3_7_WEIRD_CAST2_SROA_IDX:%.*]] = getelementptr inbounds [9 x i8], [9 x i8]* [[A_SROA_3]], i64 0, i64 7
-; CHECK-NEXT:    [[A_SROA_3_7_WEIRD_CAST2_SROA_CAST:%.*]] = bitcast i8* [[A_SROA_3_7_WEIRD_CAST2_SROA_IDX]] to i16*
-; CHECK-NEXT:    [[A_SROA_3_7_A_SROA_3_16_WEIRD_LOAD2:%.*]] = load volatile i16, i16* [[A_SROA_3_7_WEIRD_CAST2_SROA_CAST]], align 1
-; CHECK-NEXT:    ret void
-;
-; DEBUGLOC-LABEL: @test5(
-; DEBUGLOC-NEXT:  entry:
-; DEBUGLOC-NEXT:    [[A_SROA_0:%.*]] = alloca [9 x i8], align 1, !dbg [[DBG80:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_3:%.*]] = alloca [9 x i8], align 1, !dbg [[DBG80]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata [18 x i8]* undef, metadata [[META67:![0-9]+]], metadata !DIExpression()), !dbg [[DBG80]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META68:![0-9]+]], metadata !DIExpression()), !dbg [[DBG81:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata double* undef, metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_0_PTR1_SROA_CAST2:%.*]] = bitcast [9 x i8]* [[A_SROA_0]] to double*, !dbg [[DBG83:![0-9]+]]
-; DEBUGLOC-NEXT:    store volatile double 0.000000e+00, double* [[A_SROA_0_0_PTR1_SROA_CAST2]], align 1, !dbg [[DBG83]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG84:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i16* undef, metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG85:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_7_WEIRD_CAST1_SROA_IDX4:%.*]] = getelementptr inbounds [9 x i8], [9 x i8]* [[A_SROA_0]], i64 0, i64 7, !dbg [[DBG86:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_7_WEIRD_CAST1_SROA_CAST5:%.*]] = bitcast i8* [[A_SROA_0_7_WEIRD_CAST1_SROA_IDX4]] to i16*, !dbg [[DBG86]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_7_A_SROA_0_7_WEIRD_LOAD1:%.*]] = load volatile i16, i16* [[A_SROA_0_7_WEIRD_CAST1_SROA_CAST5]], align 1, !dbg [[DBG86]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i16 [[A_SROA_0_7_A_SROA_0_7_WEIRD_LOAD1]], metadata [[META72:![0-9]+]], metadata !DIExpression()), !dbg [[DBG86]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META74:![0-9]+]], metadata !DIExpression()), !dbg [[DBG87:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata double* undef, metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG88:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_0_PTR1_SROA_CAST3:%.*]] = bitcast [9 x i8]* [[A_SROA_0]] to double*, !dbg [[DBG89:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_0_A_SROA_0_0_D1:%.*]] = load double, double* [[A_SROA_0_0_PTR1_SROA_CAST3]], align 1, !dbg [[DBG89]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata double [[A_SROA_0_0_A_SROA_0_0_D1]], metadata [[META76:![0-9]+]], metadata !DIExpression()), !dbg [[DBG89]]
-; DEBUGLOC-NEXT:    [[A_SROA_3_0_PTR2_SROA_CAST:%.*]] = bitcast [9 x i8]* [[A_SROA_3]] to double*, !dbg [[DBG90:![0-9]+]]
-; DEBUGLOC-NEXT:    store volatile double [[A_SROA_0_0_A_SROA_0_0_D1]], double* [[A_SROA_3_0_PTR2_SROA_CAST]], align 1, !dbg [[DBG90]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META77:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i16* undef, metadata [[META78:![0-9]+]], metadata !DIExpression()), !dbg [[DBG92:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_3_7_WEIRD_CAST2_SROA_IDX:%.*]] = getelementptr inbounds [9 x i8], [9 x i8]* [[A_SROA_3]], i64 0, i64 7, !dbg [[DBG93:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_3_7_WEIRD_CAST2_SROA_CAST:%.*]] = bitcast i8* [[A_SROA_3_7_WEIRD_CAST2_SROA_IDX]] to i16*, !dbg [[DBG93]]
-; DEBUGLOC-NEXT:    [[A_SROA_3_7_A_SROA_3_16_WEIRD_LOAD2:%.*]] = load volatile i16, i16* [[A_SROA_3_7_WEIRD_CAST2_SROA_CAST]], align 1, !dbg [[DBG93]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i16 [[A_SROA_3_7_A_SROA_3_16_WEIRD_LOAD2]], metadata [[META79:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93]]
-; DEBUGLOC-NEXT:    ret void, !dbg [[DBG94:![0-9]+]]
-;
+; CHECK: alloca [9 x i8]
+; CHECK: alloca [9 x i8]
+; CHECK: store volatile double 0.0{{.*}}, double* %{{.*}}, align 1
+; CHECK: load volatile i16, i16* %{{.*}}, align 1
+; CHECK: load double, double* %{{.*}}, align 1
+; CHECK: store volatile double %{{.*}}, double* %{{.*}}, align 1
+; CHECK: load volatile i16, i16* %{{.*}}, align 1
+; CHECK: ret void
 
 entry:
   %a = alloca [18 x i8]
@@ -245,29 +133,13 @@ define void @test6() {
 ; We should set the alignment on all load and store operations; make sure
 ; we choose an appropriate alignment.
 ; CHECK-LABEL: @test6(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca double, align 8
-; CHECK-NEXT:    [[A_SROA_2:%.*]] = alloca double, align 8
-; CHECK-NEXT:    store volatile double 0.000000e+00, double* [[A_SROA_0]], align 8
-; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_VAL:%.*]] = load double, double* [[A_SROA_0]], align 8
-; CHECK-NEXT:    store volatile double [[A_SROA_0_0_A_SROA_0_0_VAL]], double* [[A_SROA_2]], align 8
-; CHECK-NEXT:    ret void
-;
-; DEBUGLOC-LABEL: @test6(
-; DEBUGLOC-NEXT:  entry:
-; DEBUGLOC-NEXT:    [[A_SROA_0:%.*]] = alloca double, align 8, !dbg [[DBG103:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_2:%.*]] = alloca double, align 8, !dbg [[DBG103]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata [16 x i8]* undef, metadata [[META97:![0-9]+]], metadata !DIExpression()), !dbg [[DBG103]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META98:![0-9]+]], metadata !DIExpression()), !dbg [[DBG104:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata double* undef, metadata [[META99:![0-9]+]], metadata !DIExpression()), !dbg [[DBG105:![0-9]+]]
-; DEBUGLOC-NEXT:    store volatile double 0.000000e+00, double* [[A_SROA_0]], align 8, !dbg [[DBG106:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META100:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata double* undef, metadata [[META101:![0-9]+]], metadata !DIExpression()), !dbg [[DBG108:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_0_A_SROA_0_0_VAL:%.*]] = load double, double* [[A_SROA_0]], align 8, !dbg [[DBG109:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata double [[A_SROA_0_0_A_SROA_0_0_VAL]], metadata [[META102:![0-9]+]], metadata !DIExpression()), !dbg [[DBG109]]
-; DEBUGLOC-NEXT:    store volatile double [[A_SROA_0_0_A_SROA_0_0_VAL]], double* [[A_SROA_2]], align 8, !dbg [[DBG110:![0-9]+]]
-; DEBUGLOC-NEXT:    ret void, !dbg [[DBG111:![0-9]+]]
-;
+; CHECK: alloca double, align 8{{$}}
+; CHECK: alloca double, align 8{{$}}
+; CHECK: store{{.*}}, align 8
+; CHECK: load{{.*}}, align 8
+; CHECK: store{{.*}}, align 8
+; CHECK-NOT: align
+; CHECK: ret void
 
 entry:
   %a = alloca [16 x i8]
@@ -287,40 +159,7 @@ define void @test7(i8* %out) {
 ; Test that we properly compute the destination alignment when rewriting
 ; memcpys as direct loads or stores.
 ; CHECK-LABEL: @test7(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_0_0_OUT_SROA_CAST:%.*]] = bitcast i8* [[OUT:%.*]] to double*
-; CHECK-NEXT:    [[A_SROA_0_0_COPYLOAD:%.*]] = load double, double* [[A_SROA_0_0_OUT_SROA_CAST]], align 1
-; CHECK-NEXT:    [[A_SROA_4_0_OUT_SROA_IDX:%.*]] = getelementptr inbounds i8, i8* [[OUT]], i64 8
-; CHECK-NEXT:    [[A_SROA_4_0_OUT_SROA_CAST:%.*]] = bitcast i8* [[A_SROA_4_0_OUT_SROA_IDX]] to double*
-; CHECK-NEXT:    [[A_SROA_4_0_COPYLOAD:%.*]] = load double, double* [[A_SROA_4_0_OUT_SROA_CAST]], align 1
-; CHECK-NEXT:    [[A_SROA_0_0_OUT_SROA_CAST1:%.*]] = bitcast i8* [[OUT]] to double*
-; CHECK-NEXT:    store double [[A_SROA_4_0_COPYLOAD]], double* [[A_SROA_0_0_OUT_SROA_CAST1]], align 1
-; CHECK-NEXT:    [[A_SROA_4_0_OUT_SROA_IDX3:%.*]] = getelementptr inbounds i8, i8* [[OUT]], i64 8
-; CHECK-NEXT:    [[A_SROA_4_0_OUT_SROA_CAST4:%.*]] = bitcast i8* [[A_SROA_4_0_OUT_SROA_IDX3]] to double*
-; CHECK-NEXT:    store double [[A_SROA_0_0_COPYLOAD]], double* [[A_SROA_4_0_OUT_SROA_CAST4]], align 1
-; CHECK-NEXT:    ret void
-;
-; DEBUGLOC-LABEL: @test7(
-; DEBUGLOC-NEXT:  entry:
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata [16 x i8]* undef, metadata [[META114:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META115:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata double* undef, metadata [[META116:![0-9]+]], metadata !DIExpression()), !dbg [[DBG123:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* undef, metadata [[META117:![0-9]+]], metadata !DIExpression()), !dbg [[DBG124:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata double* undef, metadata [[META118:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_0_OUT_SROA_CAST:%.*]] = bitcast i8* [[OUT:%.*]] to double*, !dbg [[DBG126:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_0_COPYLOAD:%.*]] = load double, double* [[A_SROA_0_0_OUT_SROA_CAST]], align 1, !dbg [[DBG126]]
-; DEBUGLOC-NEXT:    [[A_SROA_4_0_OUT_SROA_IDX:%.*]] = getelementptr inbounds i8, i8* [[OUT]], i64 8, !dbg [[DBG126]]
-; DEBUGLOC-NEXT:    [[A_SROA_4_0_OUT_SROA_CAST:%.*]] = bitcast i8* [[A_SROA_4_0_OUT_SROA_IDX]] to double*, !dbg [[DBG126]]
-; DEBUGLOC-NEXT:    [[A_SROA_4_0_COPYLOAD:%.*]] = load double, double* [[A_SROA_4_0_OUT_SROA_CAST]], align 1, !dbg [[DBG126]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata double [[A_SROA_4_0_COPYLOAD]], metadata [[META119:![0-9]+]], metadata !DIExpression()), !dbg [[DBG127:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata double [[A_SROA_0_0_COPYLOAD]], metadata [[META120:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128:![0-9]+]]
-; DEBUGLOC-NEXT:    [[A_SROA_0_0_OUT_SROA_CAST1:%.*]] = bitcast i8* [[OUT]] to double*, !dbg [[DBG129:![0-9]+]]
-; DEBUGLOC-NEXT:    store double [[A_SROA_4_0_COPYLOAD]], double* [[A_SROA_0_0_OUT_SROA_CAST1]], align 1, !dbg [[DBG129]]
-; DEBUGLOC-NEXT:    [[A_SROA_4_0_OUT_SROA_IDX3:%.*]] = getelementptr inbounds i8, i8* [[OUT]], i64 8, !dbg [[DBG129]]
-; DEBUGLOC-NEXT:    [[A_SROA_4_0_OUT_SROA_CAST4:%.*]] = bitcast i8* [[A_SROA_4_0_OUT_SROA_IDX3]] to double*, !dbg [[DBG129]]
-; DEBUGLOC-NEXT:    store double [[A_SROA_0_0_COPYLOAD]], double* [[A_SROA_4_0_OUT_SROA_CAST4]], align 1, !dbg [[DBG129]]
-; DEBUGLOC-NEXT:    ret void, !dbg [[DBG130:![0-9]+]]
-;
+; CHECK-NOT: alloca
 
 entry:
   %a = alloca [16 x i8]
@@ -330,6 +169,8 @@ entry:
   %ptr2 = bitcast i8* %raw2 to double*
 
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %raw1, i8* %out, i32 16, i1 false)
+; CHECK: %[[val2:.*]] = load double, double* %{{.*}}, align 1
+; CHECK: %[[val1:.*]] = load double, double* %{{.*}}, align 1
 
   %val1 = load double, double* %ptr2, align 1
   %val2 = load double, double* %ptr1, align 1
@@ -338,56 +179,20 @@ entry:
   store double %val2, double* %ptr2, align 1
 
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %out, i8* %raw1, i32 16, i1 false)
+; CHECK: store double %[[val1]], double* %{{.*}}, align 1
+; CHECK: store double %[[val2]], double* %{{.*}}, align 1
 
   ret void
+; CHECK: ret void
 }
 
 define void @test8() {
 ; CHECK-LABEL: @test8(
-; CHECK-NEXT:    [[PTR:%.*]] = alloca [5 x i32], align 1
-; CHECK-NEXT:    [[PTR_8:%.*]] = bitcast [5 x i32]* [[PTR]] to i8*
-; CHECK-NEXT:    call void @populate(i8* [[PTR_8]])
-; CHECK-NEXT:    [[VAL_FCA_0_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 0
-; CHECK-NEXT:    [[VAL_FCA_0_LOAD:%.*]] = load i32, i32* [[VAL_FCA_0_GEP]], align 1
-; CHECK-NEXT:    [[VAL_FCA_0_INSERT:%.*]] = insertvalue [5 x i32] poison, i32 [[VAL_FCA_0_LOAD]], 0
-; CHECK-NEXT:    [[VAL_FCA_1_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 1
-; CHECK-NEXT:    [[VAL_FCA_1_LOAD:%.*]] = load i32, i32* [[VAL_FCA_1_GEP]], align 1
-; CHECK-NEXT:    [[VAL_FCA_1_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_0_INSERT]], i32 [[VAL_FCA_1_LOAD]], 1
-; CHECK-NEXT:    [[VAL_FCA_2_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 2
-; CHECK-NEXT:    [[VAL_FCA_2_LOAD:%.*]] = load i32, i32* [[VAL_FCA_2_GEP]], align 1
-; CHECK-NEXT:    [[VAL_FCA_2_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_1_INSERT]], i32 [[VAL_FCA_2_LOAD]], 2
-; CHECK-NEXT:    [[VAL_FCA_3_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 3
-; CHECK-NEXT:    [[VAL_FCA_3_LOAD:%.*]] = load i32, i32* [[VAL_FCA_3_GEP]], align 1
-; CHECK-NEXT:    [[VAL_FCA_3_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_2_INSERT]], i32 [[VAL_FCA_3_LOAD]], 3
-; CHECK-NEXT:    [[VAL_FCA_4_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 4
-; CHECK-NEXT:    [[VAL_FCA_4_LOAD:%.*]] = load i32, i32* [[VAL_FCA_4_GEP]], align 1
-; CHECK-NEXT:    [[VAL_FCA_4_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_3_INSERT]], i32 [[VAL_FCA_4_LOAD]], 4
-; CHECK-NEXT:    ret void
-;
-; DEBUGLOC-LABEL: @test8(
-; DEBUGLOC-NEXT:    [[PTR:%.*]] = alloca [5 x i32], align 1, !dbg [[DBG137:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata [5 x i32]* [[PTR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG137]]
-; DEBUGLOC-NEXT:    [[PTR_8:%.*]] = bitcast [5 x i32]* [[PTR]] to i8*, !dbg [[DBG138:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* [[PTR_8]], metadata [[META134:![0-9]+]], metadata !DIExpression()), !dbg [[DBG138]]
-; DEBUGLOC-NEXT:    call void @populate(i8* [[PTR_8]]), !dbg [[DBG139:![0-9]+]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_0_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 0, !dbg [[DBG140:![0-9]+]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_0_LOAD:%.*]] = load i32, i32* [[VAL_FCA_0_GEP]], align 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_0_INSERT:%.*]] = insertvalue [5 x i32] poison, i32 [[VAL_FCA_0_LOAD]], 0, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_1_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_1_LOAD:%.*]] = load i32, i32* [[VAL_FCA_1_GEP]], align 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_1_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_0_INSERT]], i32 [[VAL_FCA_1_LOAD]], 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_2_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 2, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_2_LOAD:%.*]] = load i32, i32* [[VAL_FCA_2_GEP]], align 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_2_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_1_INSERT]], i32 [[VAL_FCA_2_LOAD]], 2, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_3_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 3, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_3_LOAD:%.*]] = load i32, i32* [[VAL_FCA_3_GEP]], align 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_3_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_2_INSERT]], i32 [[VAL_FCA_3_LOAD]], 3, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_4_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 4, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_4_LOAD:%.*]] = load i32, i32* [[VAL_FCA_4_GEP]], align 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_4_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_3_INSERT]], i32 [[VAL_FCA_4_LOAD]], 4, !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata [5 x i32] [[VAL_FCA_4_INSERT]], metadata [[META135:![0-9]+]], metadata !DIExpression()), !dbg [[DBG140]]
-; DEBUGLOC-NEXT:    ret void, !dbg [[DBG141:![0-9]+]]
-;
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
 
   %ptr = alloca [5 x i32], align 1
   %ptr.8 = bitcast [5 x i32]* %ptr to i8*
@@ -398,50 +203,11 @@ define void @test8() {
 
 define void @test9() {
 ; CHECK-LABEL: @test9(
-; CHECK-NEXT:    [[PTR:%.*]] = alloca [5 x i32], align 8
-; CHECK-NEXT:    [[PTR_8:%.*]] = bitcast [5 x i32]* [[PTR]] to i8*
-; CHECK-NEXT:    call void @populate(i8* [[PTR_8]])
-; CHECK-NEXT:    [[VAL_FCA_0_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 0
-; CHECK-NEXT:    [[VAL_FCA_0_LOAD:%.*]] = load i32, i32* [[VAL_FCA_0_GEP]], align 8
-; CHECK-NEXT:    [[VAL_FCA_0_INSERT:%.*]] = insertvalue [5 x i32] poison, i32 [[VAL_FCA_0_LOAD]], 0
-; CHECK-NEXT:    [[VAL_FCA_1_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 1
-; CHECK-NEXT:    [[VAL_FCA_1_LOAD:%.*]] = load i32, i32* [[VAL_FCA_1_GEP]], align 4
-; CHECK-NEXT:    [[VAL_FCA_1_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_0_INSERT]], i32 [[VAL_FCA_1_LOAD]], 1
-; CHECK-NEXT:    [[VAL_FCA_2_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 2
-; CHECK-NEXT:    [[VAL_FCA_2_LOAD:%.*]] = load i32, i32* [[VAL_FCA_2_GEP]], align 8
-; CHECK-NEXT:    [[VAL_FCA_2_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_1_INSERT]], i32 [[VAL_FCA_2_LOAD]], 2
-; CHECK-NEXT:    [[VAL_FCA_3_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 3
-; CHECK-NEXT:    [[VAL_FCA_3_LOAD:%.*]] = load i32, i32* [[VAL_FCA_3_GEP]], align 4
-; CHECK-NEXT:    [[VAL_FCA_3_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_2_INSERT]], i32 [[VAL_FCA_3_LOAD]], 3
-; CHECK-NEXT:    [[VAL_FCA_4_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 4
-; CHECK-NEXT:    [[VAL_FCA_4_LOAD:%.*]] = load i32, i32* [[VAL_FCA_4_GEP]], align 8
-; CHECK-NEXT:    [[VAL_FCA_4_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_3_INSERT]], i32 [[VAL_FCA_4_LOAD]], 4
-; CHECK-NEXT:    ret void
-;
-; DEBUGLOC-LABEL: @test9(
-; DEBUGLOC-NEXT:    [[PTR:%.*]] = alloca [5 x i32], align 8, !dbg [[DBG147:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata [5 x i32]* [[PTR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG147]]
-; DEBUGLOC-NEXT:    [[PTR_8:%.*]] = bitcast [5 x i32]* [[PTR]] to i8*, !dbg [[DBG148:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* [[PTR_8]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG148]]
-; DEBUGLOC-NEXT:    call void @populate(i8* [[PTR_8]]), !dbg [[DBG149:![0-9]+]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_0_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 0, !dbg [[DBG150:![0-9]+]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_0_LOAD:%.*]] = load i32, i32* [[VAL_FCA_0_GEP]], align 8, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_0_INSERT:%.*]] = insertvalue [5 x i32] poison, i32 [[VAL_FCA_0_LOAD]], 0, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_1_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 1, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_1_LOAD:%.*]] = load i32, i32* [[VAL_FCA_1_GEP]], align 4, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_1_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_0_INSERT]], i32 [[VAL_FCA_1_LOAD]], 1, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_2_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 2, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_2_LOAD:%.*]] = load i32, i32* [[VAL_FCA_2_GEP]], align 8, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_2_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_1_INSERT]], i32 [[VAL_FCA_2_LOAD]], 2, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_3_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 3, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_3_LOAD:%.*]] = load i32, i32* [[VAL_FCA_3_GEP]], align 4, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_3_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_2_INSERT]], i32 [[VAL_FCA_3_LOAD]], 3, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_4_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 4, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_4_LOAD:%.*]] = load i32, i32* [[VAL_FCA_4_GEP]], align 8, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_4_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_3_INSERT]], i32 [[VAL_FCA_4_LOAD]], 4, !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata [5 x i32] [[VAL_FCA_4_INSERT]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG150]]
-; DEBUGLOC-NEXT:    ret void, !dbg [[DBG151:![0-9]+]]
-;
+; CHECK: load i32, {{.*}}, align 8
+; CHECK: load i32, {{.*}}, align 4
+; CHECK: load i32, {{.*}}, align 8
+; CHECK: load i32, {{.*}}, align 4
+; CHECK: load i32, {{.*}}, align 8
 
   %ptr = alloca [5 x i32], align 8
   %ptr.8 = bitcast [5 x i32]* %ptr to i8*
@@ -452,50 +218,11 @@ define void @test9() {
 
 define void @test10() {
 ; CHECK-LABEL: @test10(
-; CHECK-NEXT:    [[PTR:%.*]] = alloca { i32, i8, i8, { i8, i16 } }, align 2
-; CHECK-NEXT:    [[PTR_8:%.*]] = bitcast { i32, i8, i8, { i8, i16 } }* [[PTR]] to i8*
-; CHECK-NEXT:    call void @populate(i8* [[PTR_8]])
-; CHECK-NEXT:    [[VAL_FCA_0_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 0
-; CHECK-NEXT:    [[VAL_FCA_0_LOAD:%.*]] = load i32, i32* [[VAL_FCA_0_GEP]], align 2
-; CHECK-NEXT:    [[VAL_FCA_0_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } poison, i32 [[VAL_FCA_0_LOAD]], 0
-; CHECK-NEXT:    [[VAL_FCA_1_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 1
-; CHECK-NEXT:    [[VAL_FCA_1_LOAD:%.*]] = load i8, i8* [[VAL_FCA_1_GEP]], align 2
-; CHECK-NEXT:    [[VAL_FCA_1_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_0_INSERT]], i8 [[VAL_FCA_1_LOAD]], 1
-; CHECK-NEXT:    [[VAL_FCA_2_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 2
-; CHECK-NEXT:    [[VAL_FCA_2_LOAD:%.*]] = load i8, i8* [[VAL_FCA_2_GEP]], align 1
-; CHECK-NEXT:    [[VAL_FCA_2_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_1_INSERT]], i8 [[VAL_FCA_2_LOAD]], 2
-; CHECK-NEXT:    [[VAL_FCA_3_0_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 3, i32 0
-; CHECK-NEXT:    [[VAL_FCA_3_0_LOAD:%.*]] = load i8, i8* [[VAL_FCA_3_0_GEP]], align 2
-; CHECK-NEXT:    [[VAL_FCA_3_0_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_2_INSERT]], i8 [[VAL_FCA_3_0_LOAD]], 3, 0
-; CHECK-NEXT:    [[VAL_FCA_3_1_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 3, i32 1
-; CHECK-NEXT:    [[VAL_FCA_3_1_LOAD:%.*]] = load i16, i16* [[VAL_FCA_3_1_GEP]], align 2
-; CHECK-NEXT:    [[VAL_FCA_3_1_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_3_0_INSERT]], i16 [[VAL_FCA_3_1_LOAD]], 3, 1
-; CHECK-NEXT:    ret void
-;
-; DEBUGLOC-LABEL: @test10(
-; DEBUGLOC-NEXT:    [[PTR:%.*]] = alloca { i32, i8, i8, { i8, i16 } }, align 2, !dbg [[DBG158:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata { i32, i8, i8, { i8, i16 } }* [[PTR]], metadata [[META154:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158]]
-; DEBUGLOC-NEXT:    [[PTR_8:%.*]] = bitcast { i32, i8, i8, { i8, i16 } }* [[PTR]] to i8*, !dbg [[DBG159:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i8* [[PTR_8]], metadata [[META155:![0-9]+]], metadata !DIExpression()), !dbg [[DBG159]]
-; DEBUGLOC-NEXT:    call void @populate(i8* [[PTR_8]]), !dbg [[DBG160:![0-9]+]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_0_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 0, !dbg [[DBG161:![0-9]+]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_0_LOAD:%.*]] = load i32, i32* [[VAL_FCA_0_GEP]], align 2, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_0_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } poison, i32 [[VAL_FCA_0_LOAD]], 0, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_1_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 1, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_1_LOAD:%.*]] = load i8, i8* [[VAL_FCA_1_GEP]], align 2, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_1_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_0_INSERT]], i8 [[VAL_FCA_1_LOAD]], 1, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_2_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 2, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_2_LOAD:%.*]] = load i8, i8* [[VAL_FCA_2_GEP]], align 1, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_2_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_1_INSERT]], i8 [[VAL_FCA_2_LOAD]], 2, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_3_0_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 3, i32 0, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_3_0_LOAD:%.*]] = load i8, i8* [[VAL_FCA_3_0_GEP]], align 2, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_3_0_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_2_INSERT]], i8 [[VAL_FCA_3_0_LOAD]], 3, 0, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_3_1_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 3, i32 1, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_3_1_LOAD:%.*]] = load i16, i16* [[VAL_FCA_3_1_GEP]], align 2, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    [[VAL_FCA_3_1_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_3_0_INSERT]], i16 [[VAL_FCA_3_1_LOAD]], 3, 1, !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata { i32, i8, i8, { i8, i16 } } [[VAL_FCA_3_1_INSERT]], metadata [[META156:![0-9]+]], metadata !DIExpression()), !dbg [[DBG161]]
-; DEBUGLOC-NEXT:    ret void, !dbg [[DBG162:![0-9]+]]
-;
+; CHECK: load i32, {{.*}}, align 2
+; CHECK: load i8, {{.*}}, align 2
+; CHECK: load i8, {{.*}}, align 1
+; CHECK: load i8, {{.*}}, align 2
+; CHECK: load i16, {{.*}}, align 2
 
   %ptr = alloca {i32, i8, i8, {i8, i16}}, align 2
   %ptr.8 = bitcast {i32, i8, i8, {i8, i16}}* %ptr to i8*
@@ -506,28 +233,8 @@ define void @test10() {
 
 %struct = type { i32, i32 }
 define dso_local i32 @pr45010(%struct* %A) {
-; CHECK-LABEL: @pr45010(
-; CHECK-NEXT:    [[B_SROA_0:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[A_I:%.*]] = getelementptr inbounds [[STRUCT:%.*]], %struct* [[A:%.*]], i32 0, i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A_I]], align 4
-; CHECK-NEXT:    store atomic volatile i32 [[TMP1]], i32* [[B_SROA_0]] release, align 4
-; CHECK-NEXT:    [[B_SROA_0_0_B_SROA_0_0_X:%.*]] = load atomic volatile i32, i32* [[B_SROA_0]] acquire, align 4
-; CHECK-NEXT:    ret i32 [[B_SROA_0_0_B_SROA_0_0_X]]
-;
-; DEBUGLOC-LABEL: @pr45010(
-; DEBUGLOC-NEXT:    [[B_SROA_0:%.*]] = alloca i32, align 4, !dbg [[DBG172:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata %struct* undef, metadata [[META165:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-; DEBUGLOC-NEXT:    [[A_I:%.*]] = getelementptr inbounds [[STRUCT:%.*]], %struct* [[A:%.*]], i32 0, i32 0, !dbg [[DBG173:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i32* [[A_I]], metadata [[META166:![0-9]+]], metadata !DIExpression()), !dbg [[DBG173]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i32* undef, metadata [[META167:![0-9]+]], metadata !DIExpression()), !dbg [[DBG174:![0-9]+]]
-; DEBUGLOC-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A_I]], align 4, !dbg [[DBG175:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP1]], metadata [[META168:![0-9]+]], metadata !DIExpression()), !dbg [[DBG175]]
-; DEBUGLOC-NEXT:    store atomic volatile i32 [[TMP1]], i32* [[B_SROA_0]] release, align 4, !dbg [[DBG176:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i32* undef, metadata [[META170:![0-9]+]], metadata !DIExpression()), !dbg [[DBG177:![0-9]+]]
-; DEBUGLOC-NEXT:    [[B_SROA_0_0_B_SROA_0_0_X:%.*]] = load atomic volatile i32, i32* [[B_SROA_0]] acquire, align 4, !dbg [[DBG178:![0-9]+]]
-; DEBUGLOC-NEXT:    call void @llvm.dbg.value(metadata i32 [[B_SROA_0_0_B_SROA_0_0_X]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG178]]
-; DEBUGLOC-NEXT:    ret i32 [[B_SROA_0_0_B_SROA_0_0_X]], !dbg [[DBG179:![0-9]+]]
-;
+; CHECK-LABEL: @pr45010
+; CHECK: load atomic volatile i32, {{.*}}, align 4
 
   %B = alloca %struct, align 4
   %A.i = getelementptr inbounds %struct, %struct* %A, i32 0, i32 0

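For readers comparing the two styles: the hand-written patterns restored above lean on FileCheck's capture syntax, where [[name:regex]] binds a string once and every later [[name]] use must match the same text, while {{.*}} matches without binding. A minimal sketch of the idiom (the value name here is illustrative, not tied to any one test in this file):

; CHECK: %[[val:.*]] = load double, double* %{{.*}}, align 1
; CHECK: store double %[[val]], double* %{{.*}}, align 1

This is deliberately looser than the autogenerated CHECK-NEXT form removed in these hunks, which pins every instruction, operand name, and ordering.
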
diff  --git a/llvm/test/Transforms/SROA/big-endian.ll b/llvm/test/Transforms/SROA/big-endian.ll
index 7538c311aa518..0853f9e9ed4ae 100644
--- a/llvm/test/Transforms/SROA/big-endian.ll
+++ b/llvm/test/Transforms/SROA/big-endian.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=sroa -S | FileCheck %s
 
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
@@ -10,31 +9,11 @@ define i8 @test1() {
 ; ordering.
 ;
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_3_0_INSERT_EXT:%.*]] = zext i8 0 to i24
-; CHECK-NEXT:    [[A_SROA_3_0_INSERT_MASK:%.*]] = and i24 undef, -256
-; CHECK-NEXT:    [[A_SROA_3_0_INSERT_INSERT:%.*]] = or i24 [[A_SROA_3_0_INSERT_MASK]], [[A_SROA_3_0_INSERT_EXT]]
-; CHECK-NEXT:    [[A_SROA_2_0_INSERT_EXT:%.*]] = zext i8 0 to i24
-; CHECK-NEXT:    [[A_SROA_2_0_INSERT_SHIFT:%.*]] = shl i24 [[A_SROA_2_0_INSERT_EXT]], 8
-; CHECK-NEXT:    [[A_SROA_2_0_INSERT_MASK:%.*]] = and i24 [[A_SROA_3_0_INSERT_INSERT]], -65281
-; CHECK-NEXT:    [[A_SROA_2_0_INSERT_INSERT:%.*]] = or i24 [[A_SROA_2_0_INSERT_MASK]], [[A_SROA_2_0_INSERT_SHIFT]]
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i8 0 to i24
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_SHIFT:%.*]] = shl i24 [[A_SROA_0_0_INSERT_EXT]], 16
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_MASK:%.*]] = and i24 [[A_SROA_2_0_INSERT_INSERT]], 65535
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_INSERT:%.*]] = or i24 [[A_SROA_0_0_INSERT_MASK]], [[A_SROA_0_0_INSERT_SHIFT]]
-; CHECK-NEXT:    [[B_SROA_0_0_EXTRACT_SHIFT:%.*]] = lshr i24 [[A_SROA_0_0_INSERT_INSERT]], 16
-; CHECK-NEXT:    [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[B_SROA_0_0_EXTRACT_SHIFT]] to i8
-; CHECK-NEXT:    [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i24 [[A_SROA_0_0_INSERT_INSERT]], 8
-; CHECK-NEXT:    [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[B_SROA_2_0_EXTRACT_SHIFT]] to i8
-; CHECK-NEXT:    [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[A_SROA_0_0_INSERT_INSERT]] to i8
-; CHECK-NEXT:    [[BSUM0:%.*]] = add i8 [[B_SROA_0_0_EXTRACT_TRUNC]], [[B_SROA_2_0_EXTRACT_TRUNC]]
-; CHECK-NEXT:    [[BSUM1:%.*]] = add i8 [[BSUM0]], [[B_SROA_3_0_EXTRACT_TRUNC]]
-; CHECK-NEXT:    ret i8 [[BSUM1]]
-;
 
 entry:
   %a = alloca [3 x i8]
   %b = alloca [3 x i8]
+; CHECK-NOT: alloca
 
   %a0ptr = getelementptr [3 x i8], [3 x i8]* %a, i64 0, i32 0
   store i8 0, i8* %a0ptr
@@ -44,6 +23,19 @@ entry:
   store i8 0, i8* %a2ptr
   %aiptr = bitcast [3 x i8]* %a to i24*
   %ai = load i24, i24* %aiptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK:      %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]]
 
   %biptr = bitcast [3 x i8]* %b to i24*
   store i24 %ai, i24* %biptr
@@ -53,10 +45,20 @@ entry:
   %b1 = load i8, i8* %b1ptr
   %b2ptr = getelementptr [3 x i8], [3 x i8]* %b, i64 0, i32 2
   %b2 = load i8, i8* %b2ptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK:      %[[shift0:.*]] = lshr i24 %[[insert0]], 16
+; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
+; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8
 
   %bsum0 = add i8 %b0, %b1
   %bsum1 = add i8 %bsum0, %b2
   ret i8 %bsum1
+; CHECK:      %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
+; CHECK-NEXT: ret i8 %[[sum1]]
 }
 
 define i64 @test2() {
@@ -64,37 +66,18 @@ define i64 @test2() {
 ; promoted.
 ;
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_2_SROA_4_0_INSERT_EXT:%.*]] = zext i8 1 to i40
-; CHECK-NEXT:    [[A_SROA_2_SROA_4_0_INSERT_MASK:%.*]] = and i40 undef, -256
-; CHECK-NEXT:    [[A_SROA_2_SROA_4_0_INSERT_INSERT:%.*]] = or i40 [[A_SROA_2_SROA_4_0_INSERT_MASK]], [[A_SROA_2_SROA_4_0_INSERT_EXT]]
-; CHECK-NEXT:    [[A_SROA_2_SROA_3_0_INSERT_EXT:%.*]] = zext i24 0 to i40
-; CHECK-NEXT:    [[A_SROA_2_SROA_3_0_INSERT_SHIFT:%.*]] = shl i40 [[A_SROA_2_SROA_3_0_INSERT_EXT]], 8
-; CHECK-NEXT:    [[A_SROA_2_SROA_3_0_INSERT_MASK:%.*]] = and i40 [[A_SROA_2_SROA_4_0_INSERT_INSERT]], -4294967041
-; CHECK-NEXT:    [[A_SROA_2_SROA_3_0_INSERT_INSERT:%.*]] = or i40 [[A_SROA_2_SROA_3_0_INSERT_MASK]], [[A_SROA_2_SROA_3_0_INSERT_SHIFT]]
-; CHECK-NEXT:    [[A_SROA_2_SROA_0_0_INSERT_EXT:%.*]] = zext i8 0 to i40
-; CHECK-NEXT:    [[A_SROA_2_SROA_0_0_INSERT_SHIFT:%.*]] = shl i40 [[A_SROA_2_SROA_0_0_INSERT_EXT]], 32
-; CHECK-NEXT:    [[A_SROA_2_SROA_0_0_INSERT_MASK:%.*]] = and i40 [[A_SROA_2_SROA_3_0_INSERT_INSERT]], 4294967295
-; CHECK-NEXT:    [[A_SROA_2_SROA_0_0_INSERT_INSERT:%.*]] = or i40 [[A_SROA_2_SROA_0_0_INSERT_MASK]], [[A_SROA_2_SROA_0_0_INSERT_SHIFT]]
-; CHECK-NEXT:    [[A_SROA_2_0_INSERT_EXT:%.*]] = zext i40 [[A_SROA_2_SROA_0_0_INSERT_INSERT]] to i56
-; CHECK-NEXT:    [[A_SROA_2_0_INSERT_MASK:%.*]] = and i56 undef, -1099511627776
-; CHECK-NEXT:    [[A_SROA_2_0_INSERT_INSERT:%.*]] = or i56 [[A_SROA_2_0_INSERT_MASK]], [[A_SROA_2_0_INSERT_EXT]]
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i16 1 to i56
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_SHIFT:%.*]] = shl i56 [[A_SROA_0_0_INSERT_EXT]], 40
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_MASK:%.*]] = and i56 [[A_SROA_2_0_INSERT_INSERT]], 1099511627775
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_INSERT:%.*]] = or i56 [[A_SROA_0_0_INSERT_MASK]], [[A_SROA_0_0_INSERT_SHIFT]]
-; CHECK-NEXT:    [[RET:%.*]] = zext i56 [[A_SROA_0_0_INSERT_INSERT]] to i64
-; CHECK-NEXT:    ret i64 [[RET]]
-;
 
 entry:
   %a = alloca [7 x i8]
+; CHECK-NOT: alloca
 
   %a0ptr = getelementptr [7 x i8], [7 x i8]* %a, i64 0, i32 0
   %a1ptr = getelementptr [7 x i8], [7 x i8]* %a, i64 0, i32 1
   %a2ptr = getelementptr [7 x i8], [7 x i8]* %a, i64 0, i32 2
   %a3ptr = getelementptr [7 x i8], [7 x i8]* %a, i64 0, i32 3
 
+; CHECK-NOT: store
+; CHECK-NOT: load
 
   %a0i16ptr = bitcast i8* %a0ptr to i16*
   store i16 1, i16* %a0i16ptr
@@ -109,32 +92,44 @@ entry:
 
 ; The alloca is split into multiple slices
 ; Here, i8 1 is for %a[6]
+; CHECK: %[[ext1:.*]] = zext i8 1 to i40
+; CHECK-NEXT: %[[mask1:.*]] = and i40 undef, -256
+; CHECK-NEXT: %[[insert1:.*]] = or i40 %[[mask1]], %[[ext1]]
 
 ; Here, i24 0 is for %a[3] to %a[5]
+; CHECK-NEXT: %[[ext2:.*]] = zext i24 0 to i40
+; CHECK-NEXT: %[[shift2:.*]] = shl i40 %[[ext2]], 8
+; CHECK-NEXT: %[[mask2:.*]] = and i40 %[[insert1]], -4294967041
+; CHECK-NEXT: %[[insert2:.*]] = or i40 %[[mask2]], %[[shift2]]
 
 ; Here, i8 0 is for %a[2]
+; CHECK-NEXT: %[[ext3:.*]] = zext i8 0 to i40
+; CHECK-NEXT: %[[shift3:.*]] = shl i40 %[[ext3]], 32
+; CHECK-NEXT: %[[mask3:.*]] = and i40 %[[insert2]], 4294967295
+; CHECK-NEXT: %[[insert3:.*]] = or i40 %[[mask3]], %[[shift3]]
 
+; CHECK-NEXT: %[[ext4:.*]] = zext i40 %[[insert3]] to i56
+; CHECK-NEXT: %[[mask4:.*]] = and i56 undef, -1099511627776
+; CHECK-NEXT: %[[insert4:.*]] = or i56 %[[mask4]], %[[ext4]]
 
+; CHECK-NOT: store
+; CHECK-NOT: load
 
   %aiptr = bitcast [7 x i8]* %a to i56*
   %ai = load i56, i56* %aiptr
   %ret = zext i56 %ai to i64
   ret i64 %ret
 ; Here, i16 1 is for %a[0] to %a[1]
+; CHECK-NEXT: %[[ext5:.*]] = zext i16 1 to i56
+; CHECK-NEXT: %[[shift5:.*]] = shl i56 %[[ext5]], 40
+; CHECK-NEXT: %[[mask5:.*]] = and i56 %[[insert4]], 1099511627775
+; CHECK-NEXT: %[[insert5:.*]] = or i56 %[[mask5]], %[[shift5]]
+; CHECK-NEXT: %[[ret:.*]] = zext i56 %[[insert5]] to i64
+; CHECK-NEXT: ret i64 %[[ret]]
 }
 
 define i64 @PR14132(i1 %flag) {
 ; CHECK-LABEL: @PR14132(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[FLAG:%.*]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[B_0_LOAD_EXT:%.*]] = zext i8 1 to i64
-; CHECK-NEXT:    [[B_0_ENDIAN_SHIFT:%.*]] = shl i64 [[B_0_LOAD_EXT]], 56
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[PTR_0_SROA_SPECULATED:%.*]] = phi i64 [ [[B_0_ENDIAN_SHIFT]], [[IF_THEN]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    ret i64 [[PTR_0_SROA_SPECULATED]]
-;
 ; Here we form a PHI-node by promoting the pointer alloca first, and then in
 ; order to promote the other two allocas, we speculate the load of the
 ; now-phi-node-pointer. In doing so we end up loading a 64-bit value from an i8
@@ -147,6 +142,7 @@ entry:
   %a = alloca i64, align 8
   %b = alloca i8, align 8
   %ptr = alloca i64*, align 8
+; CHECK-NOT: alloca
 
   %ptr.cast = bitcast i64** %ptr to i8**
   store i64 0, i64* %a
@@ -157,28 +153,24 @@ entry:
 if.then:
   store i8* %b, i8** %ptr.cast
   br label %if.end
+; CHECK-NOT: store
+; CHECK: %[[ext:.*]] = zext i8 1 to i64
+; CHECK: %[[shift:.*]] = shl i64 %[[ext]], 56
 
 if.end:
   %tmp = load i64*, i64** %ptr
   %result = load i64, i64* %tmp
+; CHECK-NOT: load
+; CHECK: %[[result:.*]] = phi i64 [ %[[shift]], %if.then ], [ 0, %entry ]
 
   ret i64 %result
+; CHECK-NEXT: ret i64 %[[result]]
 }
 
 declare void @f(i64 %x, i32 %y)
 
 define void @test3() {
 ; CHECK-LABEL: @test3(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_3_0_INSERT_EXT:%.*]] = zext i32 134316040 to i64
-; CHECK-NEXT:    [[A_SROA_3_0_INSERT_MASK:%.*]] = and i64 undef, -4294967296
-; CHECK-NEXT:    [[A_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[A_SROA_3_0_INSERT_MASK]], [[A_SROA_3_0_INSERT_EXT]]
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i32 8 to i64
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_SHIFT:%.*]] = shl i64 [[A_SROA_0_0_INSERT_EXT]], 32
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_MASK:%.*]] = and i64 [[A_SROA_3_0_INSERT_INSERT]], 4294967295
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[A_SROA_0_0_INSERT_MASK]], [[A_SROA_0_0_INSERT_SHIFT]]
-; CHECK-NEXT:    call void @f(i64 [[A_SROA_0_0_INSERT_INSERT]], i32 8)
-; CHECK-NEXT:    ret void
 ;
 ; This is a test that specifically exercises the big-endian lowering because it
 ; ends up splitting a 64-bit integer into two smaller integers and has a number
@@ -186,61 +178,75 @@ define void @test3() {
 ; would miscompile this by either dropping a most significant byte or least
 ; significant byte due to shrinking the [4,8) slice to an i24, or by failing to
 ; move the bytes around correctly.
+;
 ; The magical number 34494054408 is used because it has bits set in various
 ; bytes so that it is clear if those bytes fail to be propagated.
+;
 ; If you're debugging this, rather than using the direct magical numbers, run
 ; the IR through '-sroa -instcombine'. With '-instcombine' these will be
 ; constant folded, and if the i64 doesn't round-trip correctly, you've found
 ; a bug!
+;
 entry:
   %a = alloca { i32, i24 }, align 4
+; CHECK-NOT: alloca
 
   %tmp0 = bitcast { i32, i24 }* %a to i64*
   store i64 34494054408, i64* %tmp0
   %tmp1 = load i64, i64* %tmp0, align 4
   %tmp2 = bitcast { i32, i24 }* %a to i32*
   %tmp3 = load i32, i32* %tmp2, align 4
+; CHECK: %[[HI_EXT:.*]] = zext i32 134316040 to i64
+; CHECK: %[[HI_INPUT:.*]] = and i64 undef, -4294967296
+; CHECK: %[[HI_MERGE:.*]] = or i64 %[[HI_INPUT]], %[[HI_EXT]]
+; CHECK: %[[LO_EXT:.*]] = zext i32 8 to i64
+; CHECK: %[[LO_SHL:.*]] = shl i64 %[[LO_EXT]], 32
+; CHECK: %[[LO_INPUT:.*]] = and i64 %[[HI_MERGE]], 4294967295
+; CHECK: %[[LO_MERGE:.*]] = or i64 %[[LO_INPUT]], %[[LO_SHL]]
 
   call void @f(i64 %tmp1, i32 %tmp3)
+; CHECK: call void @f(i64 %[[LO_MERGE]], i32 8)
   ret void
+; CHECK: ret void
 }
 
 define void @test4() {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_0_0_EXTRACT_SHIFT:%.*]] = lshr i64 34494054408, 32
-; CHECK-NEXT:    [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_0_0_EXTRACT_SHIFT]] to i32
-; CHECK-NEXT:    [[A_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 34494054408 to i32
-; CHECK-NEXT:    [[A_SROA_3_0_INSERT_EXT:%.*]] = zext i32 [[A_SROA_3_0_EXTRACT_TRUNC]] to i64
-; CHECK-NEXT:    [[A_SROA_3_0_INSERT_MASK:%.*]] = and i64 undef, -4294967296
-; CHECK-NEXT:    [[A_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[A_SROA_3_0_INSERT_MASK]], [[A_SROA_3_0_INSERT_EXT]]
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i32 [[A_SROA_0_0_EXTRACT_TRUNC]] to i64
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_SHIFT:%.*]] = shl i64 [[A_SROA_0_0_INSERT_EXT]], 32
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_MASK:%.*]] = and i64 [[A_SROA_3_0_INSERT_INSERT]], 4294967295
-; CHECK-NEXT:    [[A_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[A_SROA_0_0_INSERT_MASK]], [[A_SROA_0_0_INSERT_SHIFT]]
-; CHECK-NEXT:    call void @f(i64 [[A_SROA_0_0_INSERT_INSERT]], i32 [[A_SROA_0_0_EXTRACT_TRUNC]])
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: @test4
 ;
 ; Much like @test3, this is specifically testing big-endian management of data.
 ; Also similarly, it uses constants with particular bits set to help track
 ; whether values are corrupted, and can be easily evaluated by running through
 ; -instcombine to see that the i64 round-trips.
+;
 entry:
   %a = alloca { i32, i24 }, align 4
   %a2 = alloca i64, align 4
+; CHECK-NOT: alloca
 
   store i64 34494054408, i64* %a2
   %tmp0 = bitcast { i32, i24 }* %a to i8*
   %tmp1 = bitcast i64* %a2 to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %tmp0, i8* align 4 %tmp1, i64 8, i1 false)
+; CHECK: %[[LO_SHR:.*]] = lshr i64 34494054408, 32
+; CHECK: %[[LO_START:.*]] = trunc i64 %[[LO_SHR]] to i32
+; CHECK: %[[HI_START:.*]] = trunc i64 34494054408 to i32
 
   %tmp2 = bitcast { i32, i24 }* %a to i64*
   %tmp3 = load i64, i64* %tmp2, align 4
   %tmp4 = bitcast { i32, i24 }* %a to i32*
   %tmp5 = load i32, i32* %tmp4, align 4
+; CHECK: %[[HI_EXT:.*]] = zext i32 %[[HI_START]] to i64
+; CHECK: %[[HI_INPUT:.*]] = and i64 undef, -4294967296
+; CHECK: %[[HI_MERGE:.*]] = or i64 %[[HI_INPUT]], %[[HI_EXT]]
+; CHECK: %[[LO_EXT:.*]] = zext i32 %[[LO_START]] to i64
+; CHECK: %[[LO_SHL:.*]] = shl i64 %[[LO_EXT]], 32
+; CHECK: %[[LO_INPUT:.*]] = and i64 %[[HI_MERGE]], 4294967295
+; CHECK: %[[LO_MERGE:.*]] = or i64 %[[LO_INPUT]], %[[LO_SHL]]
 
   call void @f(i64 %tmp3, i32 %tmp5)
+; CHECK: call void @f(i64 %[[LO_MERGE]], i32 %[[LO_START]])
   ret void
+; CHECK: ret void
 }
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i1)

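A quick arithmetic note on the magic constant used by @test3 and @test4: splitting the 64-bit value at bit 32 yields exactly the two i32 constants the restored patterns look for,

  34494054408 = 0x808018008
              = (8 << 32) + 134316040

i.e. the upper 32 bits are 8 and the lower 32 bits are 134316040 (0x08018008). A correct big-endian rewrite therefore has to produce a zext of i32 134316040 for one slice and a zext of i32 8 shifted left by 32 for the other, which is what the checks above match.
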
diff  --git a/llvm/test/Transforms/SROA/dbg-inline.ll b/llvm/test/Transforms/SROA/dbg-inline.ll
index 27b5d68961a23..b3b3660f64144 100644
--- a/llvm/test/Transforms/SROA/dbg-inline.ll
+++ b/llvm/test/Transforms/SROA/dbg-inline.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test that SROA can deal with allocas that have more than one
 ; dbg.declare hanging off of it.
 
@@ -11,20 +10,16 @@ target triple = "x86_64-apple-macosx10.15.0"
 
 ; Function Attrs: noinline optnone ssp uwtable
 define i64 @_Z1g4pair(i64 %p.coerce0, i64 %p.coerce1) #0 !dbg !8 {
-; CHECK-LABEL: @_Z1g4pair(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[P_COERCE0:%.*]], metadata [[META16:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG17:![0-9]+]]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[P_COERCE0]], metadata [[META18:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG20:![0-9]+]]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[P_COERCE1:%.*]], metadata [[META16]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG17]]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[P_COERCE1]], metadata [[META18]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG20]]
-; CHECK-NEXT:    ret i64 [[P_COERCE0]], !dbg [[DBG22:![0-9]+]]
-;
 entry:
   %p = alloca %struct.pair, align 8
   %0 = getelementptr inbounds %struct.pair, %struct.pair* %p, i32 0, i32 0
   store i64 %p.coerce0, i64* %0, align 8
   %1 = getelementptr inbounds %struct.pair, %struct.pair* %p, i32 0, i32 1
   store i64 %p.coerce1, i64* %1, align 8
+  ; CHECK-DAG: call void @llvm.dbg.value(metadata i64 %p.coerce0, metadata ![[VAR:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg ![[LOC:[0-9]+]]
+  ; CHECK-DAG: call void @llvm.dbg.value(metadata i64 %p.coerce1, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg ![[LOC]]
+  ; CHECK-DAG: call void @llvm.dbg.value(metadata i64 %p.coerce0, metadata ![[INLINED_VAR:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg ![[INLINED_LOC:[0-9]+]]
+  ; CHECK-DAG: call void @llvm.dbg.value(metadata i64 %p.coerce1, metadata ![[INLINED_VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg ![[INLINED_LOC]]
   call void @llvm.dbg.declare(metadata %struct.pair* %p, metadata !17, metadata !DIExpression()), !dbg !18
   call void @llvm.dbg.declare(metadata %struct.pair* %p, metadata !21, metadata !DIExpression()), !dbg !23
   %a.i = getelementptr inbounds %struct.pair, %struct.pair* %p, i32 0, i32 0, !dbg !25
@@ -62,6 +57,7 @@ attributes #2 = { argmemonly nounwind willreturn }
 !15 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !13, file: !9, line: 1, baseType: !12, size: 64)
 !16 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !13, file: !9, line: 1, baseType: !12, size: 64, offset: 64)
 !17 = !DILocalVariable(name: "p", arg: 1, scope: !8, file: !9, line: 9, type: !13)
+; CHECK: ![[LOC]] = !DILocation
 ; CHECK-NOT: inlinedAt
 ; CHECK: =
 !18 = !DILocation(line: 9, column: 27, scope: !8)
@@ -69,6 +65,7 @@ attributes #2 = { argmemonly nounwind willreturn }
 !20 = !DILocation(line: 10, column: 10, scope: !8)
 !21 = !DILocalVariable(name: "p", arg: 1, scope: !22, file: !9, line: 5, type: !13)
 !22 = distinct !DISubprogram(name: "f", linkageName: "_ZL1f4pair", scope: !9, file: !9, line: 5, type: !10, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !2)
+; CHECK: ![[INLINED_LOC]] = !DILocation({{.*}}inlinedAt
 !23 = !DILocation(line: 5, column: 27, scope: !22, inlinedAt: !24)
 !24 = distinct !DILocation(line: 10, column: 10, scope: !8)
 !25 = !DILocation(line: 6, column: 12, scope: !22, inlinedAt: !24)

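The dbg-inline.ll checks rest on one distinction: the fragment dbg.values for the caller's variable must resolve to a plain !DILocation, while the fragments for the inlined copy must resolve to a !DILocation that carries inlinedAt; the CHECK-DAG lines only require that both pairs appear, in any order. The two metadata nodes already present in the test show the shapes being matched:

!18 = !DILocation(line: 9, column: 27, scope: !8)
!23 = !DILocation(line: 5, column: 27, scope: !22, inlinedAt: !24)
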
diff  --git a/llvm/test/Transforms/SROA/dbg-single-piece.ll b/llvm/test/Transforms/SROA/dbg-single-piece.ll
index 55aa3070aabc0..d9eb41b347725 100644
--- a/llvm/test/Transforms/SROA/dbg-single-piece.ll
+++ b/llvm/test/Transforms/SROA/dbg-single-piece.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=sroa %s -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -6,16 +5,14 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
 define void @_ZL18findInsertLocationPN4llvm17MachineBasicBlockENS_9SlotIndexERNS_13LiveIntervalsE() {
-; CHECK-LABEL: @_ZL18findInsertLocationPN4llvm17MachineBasicBlockENS_9SlotIndexERNS_13LiveIntervalsE(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata %foo* undef, metadata [[META3:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG8:![0-9]+]]
-; CHECK-NEXT:    ret void
-;
 entry:
   %retval = alloca %foo, align 8
   call void @llvm.dbg.declare(metadata %foo* %retval, metadata !1, metadata !7), !dbg !8
 ; Checks that SROA still inserts a bit_piece expression, even if it produces only one piece
 ; (as long as that piece is smaller than the whole thing)
+; CHECK-NOT: call void @llvm.dbg.value
+; CHECK: call void @llvm.dbg.value(metadata %foo* undef, {{.*}}, metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg
+; CHECK-NOT: call void @llvm.dbg.value
   %0 = bitcast %foo* %retval to i8*
   %1 = getelementptr inbounds i8, i8* %0, i64 8
   %2 = bitcast i8* %1 to %foo**

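In dbg-single-piece.ll the interesting output is the fragment expression itself:

  !DIExpression(DW_OP_LLVM_fragment, 64, 64)   ; a piece starting at bit offset 64, 64 bits wide

The restored CHECK-NOT / CHECK / CHECK-NOT sandwich only requires that exactly one dbg.value with that fragment is emitted and that no other dbg.value appears.
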
diff  --git a/llvm/test/Transforms/SROA/dead-inst.ll b/llvm/test/Transforms/SROA/dead-inst.ll
index fe320c790b39e..083c8a6221e1e 100644
--- a/llvm/test/Transforms/SROA/dead-inst.ll
+++ b/llvm/test/Transforms/SROA/dead-inst.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; SROA fails to rewrite allocs but does rewrite some phis and delete
 ; dead instructions. Ensure that this invalidates analyses required
 ; for other passes.
@@ -24,8 +23,8 @@ define hidden fastcc void @H(%class.b* noalias nocapture readnone, [2 x i64]) un
   store i64 0, i64* %.sroa.0, align 8
   %4 = extractvalue [2 x i64] %1, 1
   switch i64 %4, label %6 [
-  i64 4, label %foo
-  i64 5, label %5
+    i64 4, label %foo
+    i64 5, label %5
   ]
 
 ; <label>:5:

diff  --git a/llvm/test/Transforms/SROA/fca.ll b/llvm/test/Transforms/SROA/fca.ll
index 5174751c1682c..19be9e79a7cd4 100644
--- a/llvm/test/Transforms/SROA/fca.ll
+++ b/llvm/test/Transforms/SROA/fca.ll
@@ -1,14 +1,12 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=sroa -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
 
 define { i32, i32 } @test0(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test0(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[RESULT_FCA_0_INSERT:%.*]] = insertvalue { i32, i32 } poison, i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[RESULT_FCA_1_INSERT:%.*]] = insertvalue { i32, i32 } [[RESULT_FCA_0_INSERT]], i32 [[Y:%.*]], 1
-; CHECK-NEXT:    ret { i32, i32 } [[RESULT_FCA_1_INSERT]]
-;
+; CHECK-NOT: alloca
+; CHECK: insertvalue { i32, i32 }
+; CHECK: insertvalue { i32, i32 }
+; CHECK: ret { i32, i32 }
 
 entry:
   %a = alloca { i32, i32 }
@@ -29,17 +27,11 @@ define { i32, i32 } @test1(i32 %x, i32 %y) {
 ; split the volatile load and store here but must produce volatile scalar loads
 ; and stores from them.
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca { i32, i32 }, align 8
-; CHECK-NEXT:    [[B:%.*]] = alloca { i32, i32 }, align 8
-; CHECK-NEXT:    [[A_0_GEP1_SROA_IDX:%.*]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[A]], i64 0, i32 0
-; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[A_0_GEP1_SROA_IDX]], align 8
-; CHECK-NEXT:    [[A_4_GEP2_SROA_IDX:%.*]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[A]], i64 0, i32 1
-; CHECK-NEXT:    store i32 [[Y:%.*]], i32* [[A_4_GEP2_SROA_IDX]], align 4
-; CHECK-NEXT:    [[A_0_RESULT:%.*]] = load volatile { i32, i32 }, { i32, i32 }* [[A]], align 8
-; CHECK-NEXT:    store volatile { i32, i32 } [[A_0_RESULT]], { i32, i32 }* [[B]], align 8
-; CHECK-NEXT:    ret { i32, i32 } [[A_0_RESULT]]
-;
+; CHECK: alloca
+; CHECK: alloca
+; CHECK: load volatile { i32, i32 }, { i32, i32 }*
+; CHECK: store volatile { i32, i32 }
+; CHECK: ret { i32, i32 }
 
 entry:
   %a = alloca { i32, i32 }

diff  --git a/llvm/test/Transforms/SROA/preserve-nonnull.ll b/llvm/test/Transforms/SROA/preserve-nonnull.ll
index dcd50fb882ecc..81b23cb93dc4b 100644
--- a/llvm/test/Transforms/SROA/preserve-nonnull.ll
+++ b/llvm/test/Transforms/SROA/preserve-nonnull.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=sroa -S | FileCheck %s
 ;
 ; Make sure that SROA doesn't lose nonnull metadata
@@ -8,14 +7,13 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture r
 
 ; Check that we do basic propagation of nonnull when rewriting.
 define i8* @propagate_nonnull(i32* %v) {
-; CHECK-LABEL: @propagate_nonnull(
+; CHECK-LABEL: define i8* @propagate_nonnull(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_1:%.*]] = alloca i8*, align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[V:%.*]] to i8*
-; CHECK-NEXT:    store i8* [[TMP0]], i8** [[A_SROA_1]], align 8
-; CHECK-NEXT:    [[A_SROA_1_0_A_SROA_1_8_LOAD:%.*]] = load volatile i8*, i8** [[A_SROA_1]], align 8, !nonnull !0
-; CHECK-NEXT:    ret i8* [[A_SROA_1_0_A_SROA_1_8_LOAD]]
-;
+; CHECK-NEXT:    %[[A:.*]] = alloca i8*
+; CHECK-NEXT:    %[[V_CAST:.*]] = bitcast i32* %v to i8*
+; CHECK-NEXT:    store i8* %[[V_CAST]], i8** %[[A]]
+; CHECK-NEXT:    %[[LOAD:.*]] = load volatile i8*, i8** %[[A]], align 8, !nonnull !0
+; CHECK-NEXT:    ret i8* %[[LOAD]]
 entry:
   %a = alloca [2 x i8*]
   %a.gep0 = getelementptr [2 x i8*], [2 x i8*]* %a, i32 0, i32 0
@@ -29,13 +27,12 @@ entry:
 }
 
 define float* @turn_nonnull_into_assume(float** %arg) {
-; CHECK-LABEL: @turn_nonnull_into_assume(
+; CHECK-LABEL: define float* @turn_nonnull_into_assume(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[BUF_0_COPYLOAD:%.*]] = load float*, float** [[ARG:%.*]], align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ne float* [[BUF_0_COPYLOAD]], null
-; CHECK-NEXT:    call void @llvm.assume(i1 [[TMP0]])
-; CHECK-NEXT:    ret float* [[BUF_0_COPYLOAD]]
-;
+; CHECK-NEXT:    %[[RETURN:.*]] = load float*, float** %arg, align 8
+; CHECK-NEXT:    %[[ASSUME:.*]] = icmp ne float* %[[RETURN]], null
+; CHECK-NEXT:    call void @llvm.assume(i1 %[[ASSUME]])
+; CHECK-NEXT:    ret float* %[[RETURN]]
 entry:
   %buf = alloca float*
   %_arg_i8 = bitcast float** %arg to i8*
@@ -52,13 +49,12 @@ entry:
 ; *does* initially, but then we lose that !range metadata before we finish
 ; SROA.
 define i8* @propagate_nonnull_to_int() {
-; CHECK-LABEL: @propagate_nonnull_to_int(
+; CHECK-LABEL: define i8* @propagate_nonnull_to_int(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_1:%.*]] = alloca i8*, align 8
-; CHECK-NEXT:    store i8* inttoptr (i64 42 to i8*), i8** [[A_SROA_1]], align 8
-; CHECK-NEXT:    [[A_SROA_1_0_A_SROA_1_8_LOAD:%.*]] = load volatile i8*, i8** [[A_SROA_1]], align 8, !nonnull !0
-; CHECK-NEXT:    ret i8* [[A_SROA_1_0_A_SROA_1_8_LOAD]]
-;
+; CHECK-NEXT:    %[[A:.*]] = alloca i8*
+; CHECK-NEXT:    store i8* inttoptr (i64 42 to i8*), i8** %[[A]]
+; CHECK-NEXT:    %[[LOAD:.*]] = load volatile i8*, i8** %[[A]]
+; CHECK-NEXT:    ret i8* %[[LOAD]]
 entry:
   %a = alloca [2 x i8*]
   %a.gep0 = getelementptr [2 x i8*], [2 x i8*]* %a, i32 0, i32 0
@@ -76,10 +72,9 @@ entry:
 ; register. This can fail in interesting ways due to the rewrite iteration of
 ; SROA, resulting in PR32902.
 define i8* @propagate_nonnull_to_int_and_promote() {
-; CHECK-LABEL: @propagate_nonnull_to_int_and_promote(
+; CHECK-LABEL: define i8* @propagate_nonnull_to_int_and_promote(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    ret i8* inttoptr (i64 42 to i8*)
-;
 entry:
   %a = alloca [2 x i8*], align 8
   %a.gep0 = getelementptr [2 x i8*], [2 x i8*]* %a, i32 0, i32 0

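For preserve-nonnull.ll, the essential property is that the !nonnull attachment survives SROA's rewrite of the load. On a load the attachment is just an empty metadata node, so a minimal standalone illustration of the syntax the patterns match (function and value names here are hypothetical) is:

define i8* @keep_nonnull(i8** %p) {
  %v = load i8*, i8** %p, align 8, !nonnull !0
  ret i8* %v
}
!0 = !{}
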
diff  --git a/llvm/test/Transforms/SROA/slice-order-independence.ll b/llvm/test/Transforms/SROA/slice-order-independence.ll
index 7126662baa747..2e06e5dd1f779 100644
--- a/llvm/test/Transforms/SROA/slice-order-independence.ll
+++ b/llvm/test/Transforms/SROA/slice-order-independence.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=sroa -S | FileCheck %s
 target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
 
@@ -8,19 +7,8 @@ declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) n
 ; slices even in case of types that are skipped because their width is not a
 ; byte width multiple
 define void @skipped_inttype_first({ i16*, i32 }*) {
-; CHECK-LABEL: @skipped_inttype_first(
-; CHECK-NEXT:    [[ARG_SROA_0:%.*]] = alloca i8*, align 8
-; CHECK-NEXT:    [[ARG_SROA_0_0__SROA_CAST:%.*]] = bitcast { i16*, i32 }* [[TMP0:%.*]] to i8**
-; CHECK-NEXT:    [[ARG_SROA_0_0_COPYLOAD:%.*]] = load i8*, i8** [[ARG_SROA_0_0__SROA_CAST]], align 8
-; CHECK-NEXT:    store i8* [[ARG_SROA_0_0_COPYLOAD]], i8** [[ARG_SROA_0]], align 8
-; CHECK-NEXT:    [[ARG_SROA_3_0__SROA_IDX1:%.*]] = getelementptr inbounds { i16*, i32 }, { i16*, i32 }* [[TMP0]], i64 0, i32 1
-; CHECK-NEXT:    [[ARG_SROA_3_0__SROA_CAST:%.*]] = bitcast i32* [[ARG_SROA_3_0__SROA_IDX1]] to i64*
-; CHECK-NEXT:    [[ARG_SROA_3_0_COPYLOAD:%.*]] = load i64, i64* [[ARG_SROA_3_0__SROA_CAST]], align 8
-; CHECK-NEXT:    [[ARG_SROA_0_0_PB0_SROA_CAST2:%.*]] = bitcast i8** [[ARG_SROA_0]] to i63*
-; CHECK-NEXT:    [[ARG_SROA_0_0_ARG_SROA_0_0_B0:%.*]] = load i63, i63* [[ARG_SROA_0_0_PB0_SROA_CAST2]], align 8
-; CHECK-NEXT:    [[ARG_SROA_0_0_ARG_SROA_0_0_B1:%.*]] = load i8*, i8** [[ARG_SROA_0]], align 8
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @skipped_inttype_first
+; CHECK: alloca i8*
   %arg = alloca { i16*, i32 }, align 8
   %2 = bitcast { i16*, i32 }* %0 to i8*
   %3 = bitcast { i16*, i32 }* %arg to i8*
@@ -34,19 +22,8 @@ define void @skipped_inttype_first({ i16*, i32 }*) {
 }
 
 define void @skipped_inttype_last({ i16*, i32 }*) {
-; CHECK-LABEL: @skipped_inttype_last(
-; CHECK-NEXT:    [[ARG_SROA_0:%.*]] = alloca i8*, align 8
-; CHECK-NEXT:    [[ARG_SROA_0_0__SROA_CAST:%.*]] = bitcast { i16*, i32 }* [[TMP0:%.*]] to i8**
-; CHECK-NEXT:    [[ARG_SROA_0_0_COPYLOAD:%.*]] = load i8*, i8** [[ARG_SROA_0_0__SROA_CAST]], align 8
-; CHECK-NEXT:    store i8* [[ARG_SROA_0_0_COPYLOAD]], i8** [[ARG_SROA_0]], align 8
-; CHECK-NEXT:    [[ARG_SROA_3_0__SROA_IDX1:%.*]] = getelementptr inbounds { i16*, i32 }, { i16*, i32 }* [[TMP0]], i64 0, i32 1
-; CHECK-NEXT:    [[ARG_SROA_3_0__SROA_CAST:%.*]] = bitcast i32* [[ARG_SROA_3_0__SROA_IDX1]] to i64*
-; CHECK-NEXT:    [[ARG_SROA_3_0_COPYLOAD:%.*]] = load i64, i64* [[ARG_SROA_3_0__SROA_CAST]], align 8
-; CHECK-NEXT:    [[ARG_SROA_0_0_ARG_SROA_0_0_B1:%.*]] = load i8*, i8** [[ARG_SROA_0]], align 8
-; CHECK-NEXT:    [[ARG_SROA_0_0_PB0_SROA_CAST2:%.*]] = bitcast i8** [[ARG_SROA_0]] to i63*
-; CHECK-NEXT:    [[ARG_SROA_0_0_ARG_SROA_0_0_B0:%.*]] = load i63, i63* [[ARG_SROA_0_0_PB0_SROA_CAST2]], align 8
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: @skipped_inttype_last
+; CHECK: alloca i8*
   %arg = alloca { i16*, i32 }, align 8
   %2 = bitcast { i16*, i32 }* %0 to i8*
   %3 = bitcast { i16*, i32 }* %arg to i8*

diff  --git a/llvm/test/Transforms/SROA/vector-conversion.ll b/llvm/test/Transforms/SROA/vector-conversion.ll
index b57cb027643b6..ae3e4dc520309 100644
--- a/llvm/test/Transforms/SROA/vector-conversion.ll
+++ b/llvm/test/Transforms/SROA/vector-conversion.ll
@@ -1,105 +1,91 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=sroa -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
 
 define <4 x i64> @vector_ptrtoint({<2 x i32*>, <2 x i32*>} %x) {
-; CHECK-LABEL: @vector_ptrtoint(
-; CHECK-NEXT:    [[X_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32*>, <2 x i32*> } [[X:%.*]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint <2 x i32*> [[X_FCA_0_EXTRACT]] to <2 x i64>
-; CHECK-NEXT:    [[A_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[A_SROA_0_0_VECBLEND:%.*]] = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i64> [[A_SROA_0_0_VEC_EXPAND]], <4 x i64> undef
-; CHECK-NEXT:    [[X_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32*>, <2 x i32*> } [[X]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint <2 x i32*> [[X_FCA_1_EXTRACT]] to <2 x i64>
-; CHECK-NEXT:    [[A_SROA_0_16_VEC_EXPAND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
-; CHECK-NEXT:    [[A_SROA_0_16_VECBLEND:%.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i64> [[A_SROA_0_16_VEC_EXPAND]], <4 x i64> [[A_SROA_0_0_VECBLEND]]
-; CHECK-NEXT:    ret <4 x i64> [[A_SROA_0_16_VECBLEND]]
-;
+; CHECK-LABEL: @vector_ptrtoint
   %a = alloca {<2 x i32*>, <2 x i32*>}
+; CHECK-NOT: alloca
 
   store {<2 x i32*>, <2 x i32*>} %x, {<2 x i32*>, <2 x i32*>}* %a
+; CHECK-NOT: store
 
   %cast = bitcast {<2 x i32*>, <2 x i32*>}* %a to <4 x i64>*
   %vec = load <4 x i64>, <4 x i64>* %cast
+; CHECK-NOT: load
+; CHECK: ptrtoint
 
   ret <4 x i64> %vec
 }
 
 define <4 x i32*> @vector_inttoptr({<2 x i64>, <2 x i64>} %x) {
-; CHECK-LABEL: @vector_inttoptr(
-; CHECK-NEXT:    [[X_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[X:%.*]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = inttoptr <2 x i64> [[X_FCA_0_EXTRACT]] to <2 x i32*>
-; CHECK-NEXT:    [[A_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32*> [[TMP1]], <2 x i32*> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[A_SROA_0_0_VECBLEND:%.*]] = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32*> [[A_SROA_0_0_VEC_EXPAND]], <4 x i32*> undef
-; CHECK-NEXT:    [[X_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[X]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr <2 x i64> [[X_FCA_1_EXTRACT]] to <2 x i32*>
-; CHECK-NEXT:    [[A_SROA_0_16_VEC_EXPAND:%.*]] = shufflevector <2 x i32*> [[TMP2]], <2 x i32*> poison, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
-; CHECK-NEXT:    [[A_SROA_0_16_VECBLEND:%.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32*> [[A_SROA_0_16_VEC_EXPAND]], <4 x i32*> [[A_SROA_0_0_VECBLEND]]
-; CHECK-NEXT:    ret <4 x i32*> [[A_SROA_0_16_VECBLEND]]
-;
+; CHECK-LABEL: @vector_inttoptr
   %a = alloca {<2 x i64>, <2 x i64>}
+; CHECK-NOT: alloca
 
   store {<2 x i64>, <2 x i64>} %x, {<2 x i64>, <2 x i64>}* %a
+; CHECK-NOT: store
 
   %cast = bitcast {<2 x i64>, <2 x i64>}* %a to <4 x i32*>*
   %vec = load <4 x i32*>, <4 x i32*>* %cast
+; CHECK-NOT: load
+; CHECK: inttoptr
 
   ret <4 x i32*> %vec
 }
 
 define <2 x i64> @vector_ptrtointbitcast({<1 x i32*>, <1 x i32*>} %x) {
 ; CHECK-LABEL: @vector_ptrtointbitcast(
-; CHECK-NEXT:    [[X_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i32*>, <1 x i32*> } [[X:%.*]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint <1 x i32*> [[X_FCA_0_EXTRACT]] to <1 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
-; CHECK-NEXT:    [[A_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
-; CHECK-NEXT:    [[X_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i32*>, <1 x i32*> } [[X]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint <1 x i32*> [[X_FCA_1_EXTRACT]] to <1 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT:    [[A_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[A_SROA_0_0_VEC_INSERT]], i64 [[TMP4]], i32 1
-; CHECK-NEXT:    ret <2 x i64> [[A_SROA_0_8_VEC_INSERT]]
-;
   %a = alloca {<1 x i32*>, <1 x i32*>}
+; CHECK-NOT: alloca
 
   store {<1 x i32*>, <1 x i32*>} %x, {<1 x i32*>, <1 x i32*>}* %a
+; CHECK-NOT: store
 
   %cast = bitcast {<1 x i32*>, <1 x i32*>}* %a to <2 x i64>*
   %vec = load <2 x i64>, <2 x i64>* %cast
+; CHECK-NOT: load
+; CHECK: ptrtoint
+; CHECK: bitcast
+; CHECK: ptrtoint
+; CHECK: bitcast
 
   ret <2 x i64> %vec
 }
 
 define <2 x i8*> @vector_inttoptrbitcast_vector({<16 x i8>, <16 x i8>} %x) {
 ; CHECK-LABEL: @vector_inttoptrbitcast_vector(
-; CHECK-NEXT:    [[X_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[X:%.*]], 0
-; CHECK-NEXT:    [[X_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[X]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[X_FCA_0_EXTRACT]] to <2 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr <2 x i64> [[TMP1]] to <2 x i8*>
-; CHECK-NEXT:    ret <2 x i8*> [[TMP2]]
-;
   %a = alloca {<16 x i8>, <16 x i8>}
+; CHECK-NOT: alloca
 
   store {<16 x i8>, <16 x i8>} %x, {<16 x i8>, <16 x i8>}* %a
+; CHECK-NOT: store
 
   %cast = bitcast {<16 x i8>, <16 x i8>}* %a to <2 x i8*>*
   %vec = load <2 x i8*>, <2 x i8*>* %cast
+; CHECK-NOT: load
+; CHECK: extractvalue
+; CHECK: extractvalue
+; CHECK: bitcast
+; CHECK: inttoptr
 
   ret <2 x i8*> %vec
 }
 
 define <16 x i8> @vector_ptrtointbitcast_vector({<2 x i8*>, <2 x i8*>} %x) {
 ; CHECK-LABEL: @vector_ptrtointbitcast_vector(
-; CHECK-NEXT:    [[X_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i8*>, <2 x i8*> } [[X:%.*]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint <2 x i8*> [[X_FCA_0_EXTRACT]] to <2 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-; CHECK-NEXT:    [[X_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i8*>, <2 x i8*> } [[X]], 1
-; CHECK-NEXT:    ret <16 x i8> [[TMP2]]
-;
   %a = alloca {<2 x i8*>, <2 x i8*>}
+; CHECK-NOT: alloca
 
   store {<2 x i8*>, <2 x i8*>} %x, {<2 x i8*>, <2 x i8*>}* %a
+; CHECK-NOT: store
 
   %cast = bitcast {<2 x i8*>, <2 x i8*>}* %a to <16 x i8>*
   %vec = load <16 x i8>, <16 x i8>* %cast
+; CHECK-NOT: load
+; CHECK: extractvalue
+; CHECK: ptrtoint
+; CHECK: bitcast
+; CHECK: extractvalue
 
   ret <16 x i8> %vec
 }

diff --git a/llvm/test/Transforms/SROA/vector-promotion-different-size.ll b/llvm/test/Transforms/SROA/vector-promotion-different-size.ll
index 4a6efe8e9aea5..ff7a5319f2dbe 100644
--- a/llvm/test/Transforms/SROA/vector-promotion-different-size.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion-different-size.ll
@@ -1,35 +1,24 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=sroa -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
 
 define <4 x i1> @vector_bitcast() {
-; CHECK-LABEL: @vector_bitcast(
-; CHECK-NEXT:    [[A:%.*]] = alloca <3 x i1>, align 1
-; CHECK-NEXT:    store <3 x i1> <i1 true, i1 false, i1 true>, <3 x i1>* [[A]], align 1
-; CHECK-NEXT:    [[A_0_CAST_SROA_CAST:%.*]] = bitcast <3 x i1>* [[A]] to <4 x i1>*
-; CHECK-NEXT:    [[A_0_VEC:%.*]] = load <4 x i1>, <4 x i1>* [[A_0_CAST_SROA_CAST]], align 1
-; CHECK-NEXT:    ret <4 x i1> [[A_0_VEC]]
-;
+  ; CHECK-LABEL: @vector_bitcast
+  ; CHECK: alloca <3 x i1>
 
-  %a = alloca <3 x i1>
-  store <3 x i1> <i1 1,i1 0,i1 1>, <3 x i1>* %a
-  %cast = bitcast <3 x i1>* %a to <4 x i1>*
-  %vec = load <4 x i1>, <4 x i1>* %cast
-  ret <4 x i1> %vec
+    %a = alloca <3 x i1>
+    store <3 x i1> <i1 1,i1 0,i1 1>, <3 x i1>* %a
+    %cast = bitcast <3 x i1>* %a to <4 x i1>*
+    %vec = load <4 x i1>, <4 x i1>* %cast
+    ret <4 x i1> %vec
 }
 
 define void @vector_bitcast_2() {
-; CHECK-LABEL: @vector_bitcast_2(
-; CHECK-NEXT:    %"sum$1.host2" = alloca <32 x i16>, align 64
-; CHECK-NEXT:    store <32 x i16> undef, <32 x i16>* %"sum$1.host2", align 64
-; CHECK-NEXT:    %"sum$1.host2.0.bc.sroa_cast" = bitcast <32 x i16>* %"sum$1.host2" to <64 x i16>*
-; CHECK-NEXT:    %"sum$1.host2.0.bcl" = load <64 x i16>, <64 x i16>* %"sum$1.host2.0.bc.sroa_cast", align 64
-; CHECK-NEXT:    ret void
-;
+  ; CHECK-LABEL: @vector_bitcast_2
+  ; CHECK: alloca <32 x i16>
 
-  %"sum$1.host2" = alloca <32 x i16>
-  store <32 x i16> undef, <32 x i16>* %"sum$1.host2"
-  %bc = bitcast <32 x i16>* %"sum$1.host2" to <64 x i16>*
-  %bcl = load <64 x i16>, <64 x i16>* %bc
-  ret void
+    %"sum$1.host2" = alloca <32 x i16>
+    store <32 x i16> undef, <32 x i16>* %"sum$1.host2"
+    %bc = bitcast <32 x i16>* %"sum$1.host2" to <64 x i16>*
+    %bcl = load <64 x i16>, <64 x i16>* %bc
+    ret void
 }
